cvs2svn.py

   1 #!/usr/bin/env python
   2 #
   3 # cvs2svn: ...
   4 #
   5 # ====================================================================
   6 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   7 #
   8 # This software is licensed as described in the file COPYING, which
   9 # you should have received as part of this distribution.  The terms
  10 # are also available at http://subversion.tigris.org/license-1.html.
  11 # If newer versions of this license are posted there, you may use a
  12 # newer version instead, at your option.
  13 #
  14 # This software consists of voluntary contributions made by many
  15 # individuals.  For exact contribution history, see the revision
  16 # history and logs, available at http://cvs2svn.tigris.org/.
  17 # ====================================================================
  18
# The literal below is a Subversion keyword; when expanded it presumably
# reads "$LastChangedRevision: NNN $".  The [22:-2] slice drops the
# 22-character "$LastChangedRevision: " prefix and the trailing " $",
# leaving only the number.  With the keyword unexpanded (21 characters)
# the slice is empty and VERSION is just 'r'.
VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  20
  21 import cvs2svn_rcsparse
  22 import os
  23 import sys
  24 import sha
  25 import re
  26 import time
  27 import fileinput
  28 import string
  29 import getopt
  30 import stat
  31 import string
  32 import md5
  33 import marshal
  34 import errno
  35 import popen2
  36
  37 # Warnings and errors start with these strings.  They are typically
  38 # followed by a colon and a space, as in "%s: " ==> "Warning: ".
  39 warning_prefix = "Warning"
  40 error_prefix = "Error"
  41
  42 # Make sure this Python is recent enough.
  43 if sys.hexversion < 0x2000000:
  44   sys.stderr.write("'%s: Python 2.0 or higher required, "
  45                    "see www.python.org.\n" % error_prefix)
  46   sys.exit(1)
  47
  48 # Pretend we have true booleans on older python versions
  49 try:
  50   True
  51 except:
  52   True = 1
  53   False = 0
  54
  55 # Minimal, incomplete, version of popen2.Popen3 for those platforms
  56 # for which popen2 does not provide it.
  57 try:
  58   Popen3 = popen2.Popen3
  59 except AttributeError:
  60   class Popen3:
  61     def __init__(self, cmd, capturestderr):
  62       if type(cmd) != str:
  63         cmd = " ".join(cmd)
  64       self.fromchild, self.tochild, self.childerr = popen2.popen3(cmd,
  65                                                                   mode='b')
  66     def wait(self):
  67       return self.fromchild.close() or self.tochild.close() or \
  68              self.childerr.close()
  69
# DBM module selection
#
# The three steps below adjust which backend anydbm will use for every
# database this script opens.  NOTE(review): step 1 presumably has to
# run before anydbm is imported in step 2 -- confirm before reordering.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.  Refuse to run if one
#    of them is anydbm's default.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  print 'ERROR: your installation of Python does not contain a suitable'
  print '  DBM module. This script cannot continue.'
  print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  print '  for details.'
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to prefer gdbm
#    instead.  Unfortunately, gdbm appears not to be trouble free, either.
#    The old module is recognized by its 'bsddb' attribute lacking a
#    '__version__'.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    # No gdbm available: keep the old module, but warn the user.
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
 104
 105 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 106 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 107 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 108
 109 # This really only matches standard '1.1.1.*'-style vendor revisions.
 110 # One could conceivably have a file whose default branch is 1.1.3 or
 111 # whatever, or was that at some point in time, with vendor revisions
 112 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 113 # is the only time this regexp gets used), we'd have no basis for
 114 # assuming that the non-standard vendor branch had ever been the
 115 # default branch anyway, so we don't want this to match them anyway.
 116 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 117
DATAFILE = 'cvs2svn-data'
DUMPFILE = 'cvs2svn-dump'  # The "dumpfile" we create to load into the repos

# This text file contains records (1 per line) that describe svn
# filesystem paths that are the opening and closing source revisions
# for copies to tags and branches.  The format is as follows:
#
# SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
#
# Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
# SVN_REVNUM are the primary and secondary sorting criteria for
# creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
# A sorted version of the above file.
SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'

# This file is a temporary file for storing symbolic_name -> closing
# CVSRevision until the end of our pass where we can look up the
# corresponding SVNRevNum for the closing revs and write these out to
# the SYMBOL_OPENINGS_CLOSINGS.
SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'

# Skeleton version of an svn filesystem.
# (These supersede and will eventually replace the two above.)
# See class SVNRepositoryMirror for how these work.
SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'

# Offsets pointing to the beginning of each SYMBOLIC_NAME in
# SYMBOL_OPENINGS_CLOSINGS_SORTED
SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'

# Maps CVSRevision.unique_key()s to lists of symbolic names, where
# the CVSRevision is the last such that is a source for those symbolic
# names.  For example, if branch B's number is 1.3.0.2 in this CVS
# file, and this file's 1.3 is the latest (by date) revision among
# *all* CVS files that is a source for branch B, then the
# CVSRevision.unique_key() corresponding to this file at 1.3 would
# list at least B in its list.
SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'

# Maps CVSRevision.unique_key() to corresponding line in s-revs.
###PERF Or, we could map to an offset into s-revs, instead of dup'ing
### the s-revs data in this database.
CVS_REVS_DB = 'cvs2svn-cvs-revs.db'

# Lists all symbolic names that are tags.  Keys are strings (symbolic
# names), values are ignorable.
TAGS_DB = 'cvs2svn-tags.db'

# A list of all tags.  Each line consists of the tag name and the number
# of files in which it exists, separated by a space.
TAGS_LIST = 'cvs2svn-tags.txt'

# A list of all branches.  The file is stored as a plain text file
# to make it easy to look at in an editor.  Each line contains the
# branch name, the number of files where the branch is created, the
# commit count, and a list of tags and branches that are defined on
# revisions in the branch.
BRANCHES_LIST = 'cvs2svn-branches.txt'

# These two databases provide a bidirectional mapping between
# CVSRevision.unique_key()s and Subversion revision numbers.
#
# The first maps CVSRevision.unique_key() to a number; the values are
# not unique.
#
# The second maps a number to a list of CVSRevision.unique_key()s.
CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'

# This database maps svn_revnums to tuples of (symbolic_name, date).
#
# The svn_revnums are the revision numbers of all non-primary
# SVNCommits.  No primary SVNCommit has a key in this database.
#
# The date is stored for all commits in this database.
#
# For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'

# This database maps svn_revnums of a default branch synchronization
# commit to the svn_revnum of the primary SVNCommit that motivated it.
#
# (NOTE: Secondary commits that fill branches and tags also have a
# motivating commit, but we do not record it because it is (currently)
# not needed for anything.)
#
# This mapping is used when generating the log message for the commit
# that synchronizes the default branch with trunk.
MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 210
# How many bytes to read at a time from a pipe.  128 kiB should be
# large enough to be efficient without wasting too much memory.
PIPE_READ_SIZE = 128 * 1024

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"

# File name suffixes for the revs files (presumably appended to
# DATAFILE -- verify against the passes that create them).
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# Placeholder value for "no valid Subversion revision number".
SVN_INVALID_REVNUM = -1
 253
COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# Index just past the digest field of a revs line: CVSRevision.__str__
# writes an 8-digit hex timestamp and a space (9 characters) before the
# digest.  Presumably the digest is a hex-encoded SHA digest, i.e. two
# characters per digest byte -- confirm against the code that writes it.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'

# Officially, CVS symbolic names must use a fairly restricted set of
# characters.  Unofficially, CVS 1.10 allows any character but [$,.:;@].
# We don't care if some repositories out there use characters outside the
# official set, as long as their tags start with a letter.
# Since the unofficial set also includes [/\] we need to translate those
# into ones that don't conflict with Subversion limitations.
symbolic_name_re = re.compile('^[a-zA-Z].*$')
 279
 280 def _clean_symbolic_name(name):
 281   """Return symbolic name NAME, translating characters that Subversion
 282   does not allow in a pathname."""
 283   name = name.replace('/',',')
 284   name = name.replace('\\',';')
 285   return name
 286
 287 def _path_join(*components):
 288   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 289   Empty component are skipped."""
 290   return string.join(filter(None, components), '/')
 291
 292 def run_command(command):
 293   if os.system(command):
 294     sys.exit('Command failed: "%s"' % command)
 295
 296 def relative_name(cvsroot, fname):
 297   l = len(cvsroot)
 298   if fname[:l] == cvsroot:
 299     if fname[l] == os.sep:
 300       return string.replace(fname[l+1:], os.sep, '/')
 301     return string.replace(fname[l:], os.sep, '/')
 302   sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
 303                    " cvsroot\n" % (error_prefix, cvsroot, fname))
 304   sys.exit(1)
 305
 306 # Return a string that has not been returned by gen_key() before.
 307 gen_key_base = 0L
 308 def gen_key():
 309   global gen_key_base
 310   key = '%x' % gen_key_base
 311   gen_key_base = gen_key_base + 1
 312   return key
 313
 314 if sys.platform == "win32":
 315   def escape_shell_arg(str):
 316     return '"' + string.replace(str, '"', '"^""') + '"'
 317 else:
 318   def escape_shell_arg(str):
 319     return "'" + string.replace(str, "'", "'\\''") + "'"
 320
 321 def format_date(date):
 322   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 323   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 324   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 325
 326 def sort_file(infile, outfile):
 327   # sort the log files
 328
 329   # GNU sort will sort our dates differently (incorrectly!) if our
 330   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 331   # it to 'C'
 332   if os.environ.has_key('LC_ALL'):
 333     lc_all_tmp = os.environ['LC_ALL']
 334   else:
 335     lc_all_tmp = None
 336   os.environ['LC_ALL'] = 'C'
 337   run_command('sort %s > %s' % (infile, outfile))
 338   if lc_all_tmp is None:
 339     del os.environ['LC_ALL']
 340   else:
 341     os.environ['LC_ALL'] = lc_all_tmp
 342
 343 def print_node_tree(tree, root_node, indent_depth=0):
 344   """For debugging purposes.  Prints all nodes in TREE that are
 345   rooted at ROOT_NODE.  INDENT_DEPTH is merely for purposes of
 346   debugging with the print statement in this function."""
 347   if not indent_depth:
 348     print "TREE", "=" * 75
 349   print "TREE:", " " * (indent_depth * 2), root_node, tree[root_node]
 350   for key, value in tree[root_node].items():
 351     if key[0] == '/': #Skip flags
 352       continue
 353     print_node_tree(tree, value, (indent_depth + 1))
 354
 355 def match_regexp_list(regexp_list, string):
 356   """Return 1 if string matches any of the compiled regexps in REGEXP_LIST,
 357   else return None."""
 358   for regexp in regexp_list:
 359     if regexp.match(string):
 360       return 1
 361
 362 # These constants represent the log levels that this script supports
 363 LOG_WARN = -1
 364 LOG_QUIET = 0
 365 LOG_NORMAL = 1
 366 LOG_VERBOSE = 2
 367 class Log:
 368   """A Simple logging facility.  Each line will be timestamped is
 369   self.use_timestamps is TRUE.  This class is a Borg."""
 370   __shared_state = {}
 371   def __init__(self):
 372     self.__dict__ = self.__shared_state
 373     if self.__dict__:
 374       return
 375     self.log_level = LOG_NORMAL
 376     # Set this to true if you want to see timestamps on each line output.
 377     self.use_timestamps = None
 378     self.logger = sys.stdout
 379
 380   def _timestamp(self):
 381     """Output a detailed timestamp at the beginning of each line output."""
 382     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 383
 384   def write(self, log_level, *args):
 385     """This is the public method to use for writing to a file.  Only
 386     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 387     there are multiple ARGS, they will be separated by a space."""
 388     if log_level > self.log_level:
 389       return
 390     if self.use_timestamps:
 391       self._timestamp()
 392     self.logger.write(' '.join(map(str,args)) + "\n")
 393
 394
 395 class Cleanup:
 396   """This singleton class manages any files created by cvs2svn.  When
 397   you first create a file, call Cleanup.register, passing the
 398   filename, and the last pass that you need the file.  After the end
 399   of that pass, your file will be cleaned up after running an optional
 400   callback.  This class is a Borg."""
 401
 402   __shared_state = {}
 403   def __init__(self):
 404     self.__dict__ = self.__shared_state
 405     if self.__dict__:
 406       return
 407     self._log = {}
 408     self._callbacks = {}
 409
 410   def register(self, file, which_pass, callback=None):
 411     """Register FILE for cleanup at the end of WHICH_PASS, running
 412     function CALLBACK prior to removal.  Registering a given FILE is
 413     idempotent; you may register as many times as you wish, but it
 414     will only be cleaned up once.
 415
 416     Note that if a file is registered multiple times, only the first
 417     callback registered for that file will be called at cleanup
 418     time.  Also note that if you register a database file you must
 419     close the database before cleanup, e.g. using a callback."""
 420     if not self._log.has_key(which_pass):
 421       self._log[which_pass] = {}
 422     self._log[which_pass][file] = 1
 423     if callback and not self._callbacks.has_key(file):
 424       self._callbacks[file] = callback
 425
 426   def cleanup(self, which_pass):
 427     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 428     if not self._log.has_key(which_pass):
 429       return
 430     for file in self._log[which_pass].keys():
 431       Log().write(LOG_VERBOSE, "Deleting", file)
 432       if self._callbacks.has_key(file):
 433         self._callbacks[file]()
 434       os.unlink(file)
 435
 436
 437 # Always use these constants for opening databases.
 438 DB_OPEN_READ = 'r'
 439 DB_OPEN_NEW = 'n'
 440
 441 # A wrapper for anydbm that uses the marshal module to store items as
 442 # strings.
 443 class Database:
 444   def __init__(self, filename, mode):
 445     # pybsddb3 has a bug which prevents it from working with
 446     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 447     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 448     # for databases protected by lock and transaction support
 449     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 450     #
 451     # Therefore, manually perform the removal (we can do this, because
 452     # we know that for bsddb - but *not* anydbm in general - the database
 453     # consists of one file with the name we specify, rather than several
 454     # based on that name).
 455     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 456       if os.path.isfile(filename):
 457         os.unlink(filename)
 458       mode = 'c'
 459
 460     self.db = anydbm.open(filename, mode)
 461
 462   def has_key(self, key):
 463     return self.db.has_key(key)
 464
 465   def __getitem__(self, key):
 466     return marshal.loads(self.db[key])
 467
 468   def __setitem__(self, key, value):
 469     self.db[key] = marshal.dumps(value)
 470
 471   def __delitem__(self, key):
 472     del self.db[key]
 473
 474   def get(self, key, default):
 475     if self.has_key(key):
 476       return self.__getitem__(key)
 477     return default
 478
 479   def len(self):
 480     return len(self.db)
 481
 482
 483 class LastSymbolicNameDatabase:
 484   """ Passing every CVSRevision in s-revs to this class will result in
 485   a Database whose key is the last CVS Revision a symbolicname was
 486   seen in, and whose value is a list of all symbolicnames that were
 487   last seen in that revision."""
 488   def __init__(self, mode):
 489     self.symbols = {}
 490     self.symbol_revs_db = Database(SYMBOL_LAST_CVS_REVS_DB, mode)
 491     Cleanup().register(SYMBOL_LAST_CVS_REVS_DB, pass5)
 492
 493   # Once we've gone through all the revs,
 494   # symbols.keys() will be a list of all tags and branches, and
 495   # their corresponding values will be a key into the last CVS revision
 496   # that they were used in.
 497   def log_revision(self, c_rev):
 498     # Gather last CVS Revision for symbolic name info and tag info
 499     for tag in c_rev.tags:
 500       self.symbols[tag] = c_rev.unique_key()
 501     if c_rev.op is not OP_DELETE:
 502       for branch in c_rev.branches:
 503         self.symbols[branch] = c_rev.unique_key()
 504
 505   # Creates an inversion of symbols above--a dictionary of lists (key
 506   # = CVS rev unique_key: val = list of symbols that close in that
 507   # rev.
 508   def create_database(self):
 509     for sym, rev_unique_key in self.symbols.items():
 510       if self.symbol_revs_db.has_key(rev_unique_key):
 511         ary = self.symbol_revs_db[rev_unique_key]
 512         ary.append(sym)
 513         self.symbol_revs_db[rev_unique_key] = ary
 514       else:
 515         self.symbol_revs_db[rev_unique_key] = [sym]
 516
 517
 518 class CVSRevisionDatabase:
 519   """A Database to store CVSRevision objects and retrieve them by their
 520   unique_key()."""
 521
 522   def __init__(self, mode):
 523     """Initialize an instance, opening database in MODE (like the MODE
 524     argument to Database or anydbm.open())."""
 525     self.cvs_revs_db = Database(CVS_REVS_DB, mode)
 526     Cleanup().register(CVS_REVS_DB, pass8)
 527
 528   def log_revision(self, c_rev):
 529     """Add C_REV, a CVSRevision, to the database."""
 530     self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
 531
 532   def get_revision(self, unique_key):
 533     """Return the CVSRevision stored under UNIQUE_KEY."""
 534     return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
 535
 536
 537 class TagsDatabase(Database):
 538   """A Database to store which symbolic names are tags.
 539   Each key is a tag name.
 540   The value has no meaning, and should be set to None."""
 541   def __init__(self, mode):
 542     Database.__init__(self, TAGS_DB, mode)
 543     Cleanup().register(TAGS_DB, pass8)
 544
 545
 546 class CVSRevision:
 547   def __init__(self, ctx, *args):
 548     """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
 549
 550     If CTX is None, the following members and methods of the
 551     instantiated CVSRevision class object will be unavailable (or
 552     simply will not work correctly, if at all):
 553        cvs_path
 554        svn_path
 555        svn_trunk_path
 556        is_default_branch_revision()
 557
 558     (Note that this class treats CTX as const, because the caller
 559     likely passed in a Borg instance of a Ctx.  The reason this class
 560     takes CTX as as a parameter, instead of just instantiating a Ctx
 561     itself, is that this class should be usable outside cvs2svn.py.)
 562
 563     If there is one argument in ARGS, it is a string, in the format of
 564     a line from a revs file.  Do *not* include a trailing newline.
 565
 566     If there are multiple ARGS, there must be 15 of them,
 567     comprising a parsed revs line:
 568        timestamp       -->  (int) date stamp for this cvs revision
 569        digest          -->  (string) digest of author+logmsg
 570        op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
 571        prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
 572        rev             -->  (string) this CVS rev, e.g., "1.3"
 573        next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
 574        file_in_attic   -->  (char or None) true if RCS file is in Attic
 575        file_executable -->  (char or None) true if RCS file has exec bit set.
 576        file_size       -->  (int) size of the RCS file
 577        deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
 578        mode            -->  (string or None) "kkv", "kb", etc.
 579        branch_name     -->  (string or None) branch on which this rev occurred
 580        tags            -->  (list of strings) all tags on this revision
 581        branches        -->  (list of strings) all branches rooted in this rev
 582        fname           -->  (string) relative path of file in CVS repos
 583
 584     The two forms of initialization are equivalent."""
 585
 586     self._ctx = ctx
 587     if len(args) == 15:
 588       (self.timestamp, self.digest, self.op, self.prev_rev, self.rev,
 589        self.next_rev, self.file_in_attic, self.file_executable,
 590        self.file_size, self.deltatext_code, self.fname,
 591        self.mode, self.branch_name, self.tags, self.branches) = args
 592     elif len(args) == 1:
 593       data = args[0].split(' ', 13)
 594       self.timestamp = int(data[0], 16)
 595       self.digest = data[1]
 596       self.op = data[2]
 597       self.prev_rev = data[3]
 598       if self.prev_rev == "*":
 599         self.prev_rev = None
 600       self.rev = data[4]
 601       self.next_rev = data[5]
 602       if self.next_rev == "*":
 603         self.next_rev = None
 604       self.file_in_attic = data[6]
 605       if self.file_in_attic == "*":
 606         self.file_in_attic = None
 607       self.file_executable = data[7]
 608       if self.file_executable == "*":
 609         self.file_executable = None
 610       self.file_size = int(data[8])
 611       self.deltatext_code = data[9]
 612       self.mode = data[10]
 613       if self.mode == "*":
 614         self.mode = None
 615       self.branch_name = data[11]
 616       if self.branch_name == "*":
 617         self.branch_name = None
 618       ntags = int(data[12])
 619       tags = data[13].split(' ', ntags + 1)
 620       nbranches = int(tags[ntags])
 621       branches = tags[ntags + 1].split(' ', nbranches)
 622       self.fname = branches[nbranches]
 623       self.tags = tags[:ntags]
 624       self.branches = branches[:nbranches]
 625     else:
 626       raise TypeError, 'CVSRevision() takes 2 or 12 arguments (%d given)' % \
 627           (len(args) + 1)
 628     if ctx is not None:
 629       self.cvs_path = relative_name(self._ctx.cvsroot, self.fname[:-2])
 630       self.svn_path = self._make_path(self.cvs_path, self.branch_name)
 631       self.svn_trunk_path = self._make_path(self.cvs_path)
 632
 633   # The 'primary key' of a CVS Revision is the revision number + the
 634   # filename.  To provide a unique key (say, for a dict), we just glom
 635   # them together in a string.  By passing in self.prev_rev or
 636   # self.next_rev, you can get the unique key for their respective
 637   # CVSRevisions.
 638   def unique_key(self, revnum=None):
 639     if revnum is None:
 640       revnum = self.rev
 641     return revnum + "/" + self.fname
 642
 643   def __str__(self):
 644     return ('%08lx %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s' % (
 645       self.timestamp, self.digest, self.op,
 646       (self.prev_rev or "*"), self.rev, (self.next_rev or "*"),
 647       (self.file_in_attic or "*"), (self.file_executable or "*"),
 648       self.file_size,
 649       self.deltatext_code, (self.mode or "*"), (self.branch_name or "*"),
 650       len(self.tags), self.tags and " " or "", " ".join(self.tags),
 651       len(self.branches), self.branches and " " or "", " ".join(self.branches),
 652       self.fname, ))
 653
 654   # Returns true if this CVSRevision is the opening CVSRevision for
 655   # NAME (for this RCS file).
 656   def opens_symbolic_name(self, name):
 657     if name in self.tags:
 658       return 1
 659     if name in self.branches:
 660       return 1
 661     return 0
 662
 663   def is_default_branch_revision(self):
 664     """Return 1 if SELF.rev of SELF.cvs_path is a default branch
 665     revision according to DEFAULT_BRANCHES_DB (see the conditions
 666     documented there), else return None."""
 667     if self._ctx._default_branches_db.has_key(self.cvs_path):
 668       val = self._ctx._default_branches_db[self.cvs_path]
 669       val_last_dot = val.rindex(".")
 670       our_last_dot = self.rev.rindex(".")
 671       default_branch = val[:val_last_dot]
 672       our_branch = self.rev[:our_last_dot]
 673       default_rev_component = int(val[val_last_dot + 1:])
 674       our_rev_component = int(self.rev[our_last_dot + 1:])
 675       if (default_branch == our_branch
 676           and our_rev_component <= default_rev_component):
 677         return 1
 678     # else
 679     return None
 680
 681   def _make_path(self, path, branch_name = None):
 682     """Return the trunk path or branch path for PATH.
 683
 684     If PATH is None, return None."""
 685     # For a while, we treated each top-level subdir of the CVS
 686     # repository as a "project root" and interpolated the appropriate
 687     # genealogy (trunk|tag|branch) in according to the official
 688     # recommended layout.  For example, the path '/foo/bar/baz.c' on
 689     # branch 'Rel2' would become
 690     #
 691     #   /foo/branches/Rel2/bar/baz.c
 692     #
 693     # and on trunk it would become
 694     #
 695     #   /foo/trunk/bar/baz.c
 696     #
 697     # However, we went back to the older and simpler method of just
 698     # prepending the genealogy to the front, instead of interpolating.
 699     # So now we produce:
 700     #
 701     #   /branches/Rel2/foo/bar/baz.c
 702     #   /trunk/foo/bar/baz.c
 703     #
 704     # Why?  Well, Jack Repenning pointed out that this way is much
 705     # friendlier to "anonymously rooted subtrees" (that's a tree where
 706     # the name of the top level dir doesn't matter, the point is that if
 707     # you cd into it and, say, run 'make', something good will happen).
 708     # By interpolating, we made it impossible to point cvs2svn at some
 709     # subdir in the CVS repository and convert it as a project, because
 710     # we'd treat every subdir underneath it as an independent project
 711     # root, which is probably not what the user wanted.
 712     #
 713     # Also, see Blair Zajac's post
 714     #
 715     #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 716     #
 717     # and the surrounding thread, for why what people really want is a
 718     # way of specifying an in-repository prefix path, not interpolation.
 719     if path is None:
 720       return None
 721
 722     if branch_name:
 723       branch_name = _clean_symbolic_name(branch_name)
 724       return self._ctx.branches_base + '/' + branch_name + '/' + path
 725     else:
 726       return self._ctx.trunk_base + '/' + path
 727
 728   def rcs_path(self):
 729     """Returns the actual filesystem path to the RCS file of this
 730     CVSRevision."""
 731     if self.file_in_attic is None:
 732       return self.fname
 733     else:
 734       basepath, filename = os.path.split(self.fname)
 735       return os.path.join(basepath, 'Attic', filename)
 736
 737   def filename(self):
 738     "Return the last path component of self.fname, minus the ',v'"
 739     return os.path.split(self.fname)[-1][:-2]
 740
 741 class SymbolDatabase:
 742   """This database records information on all symbols in the RCS
 743   files.  It is created in pass 1 and it is used in pass 2."""
 744   def __init__(self):
 745     # A hash that maps tag names to commit counts
 746     self.tags = { }
 747     # A hash that maps branch names to lists of the format
 748     # [ create_count, commit_count, blockers ], where blockers
 749     # is a hash that lists the symbols that depend on the
 750     # the branch.  The blockers hash is used as a set, so the
 751     # values are not used.
 752     self.branches = { }
 753
 754   def register_tag_creation(self, name):
 755     """Register the creation of the tag NAME."""
 756     if not self.tags.has_key(name):
 757       self.tags[name] = 0
 758     self.tags[name] += 1
 759
 760   def _branch(self, name):
 761     """Helper function to get a branch node that will create and
 762     initialize the node if it does not exist."""
 763     if not self.branches.has_key(name):
 764       self.branches[name] = [ 0, 0, { } ]
 765     return self.branches[name]
 766
 767   def register_branch_creation(self, name):
 768     """Register the creation of the branch NAME."""
 769     self._branch(name)[0] += 1
 770
 771   def register_branch_commit(self, name):
 772     """Register a commit on the branch NAME."""
 773     self._branch(name)[1] += 1
 774
 775   def register_branch_blocker(self, name, blocker):
 776     """Register BLOCKER as a blocker on the branch NAME."""
 777     self._branch(name)[2][blocker] = None
 778
 779   def branch_has_commit(self, name):
 780     """Return non-zero if NAME has commits.  Returns 0 if name
 781     is not a branch or if it has no commits."""
 782     return self.branches.has_key(name) and self.branches[name][1]
 783
 784   def find_excluded_symbols(self, regexp_list):
 785     """Returns a hash of all symbols thaht match the regexps in
 786     REGEXP_LISTE.  The hash is used as a set so the values are
 787     not used."""
 788     excludes = { }
 789     for tag in self.tags.keys():
 790       if match_regexp_list(regexp_list, tag):
 791         excludes[tag] = None
 792     for branch in self.branches.keys():
 793       if match_regexp_list(regexp_list, branch):
 794         excludes[branch] = None
 795     return excludes
 796
 797   def find_branch_exclude_blockers(self, branch, excludes):
 798     """Find all blockers of BRANCH, excluding the ones in the hash
 799     EXCLUDES."""
 800     blockers = { }
 801     if excludes.has_key(branch):
 802       for blocker in self.branches[branch][2]:
 803         if not excludes.has_key(blocker):
 804           blockers[blocker] = None
 805     return blockers
 806
 807   def find_blocked_excludes(self, excludes):
 808     """Find all branches not in EXCLUDES that have blocking symbols that
 809     are not themselves excluded.  Return a hash that maps branch names
 810     to a hash of blockers.  The hash of blockes is used as a set so the
 811     values are not used."""
 812     blocked_branches = { }
 813     for branch in self.branches.keys():
 814       blockers = self.find_branch_exclude_blockers(branch, excludes)
 815       if blockers:
 816         blocked_branches[branch] = blockers
 817     return blocked_branches
 818
 819   def find_mismatches(self, excludes=None):
 820     """Find all symbols that are defined as both tags and branches,
 821     excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
 822     the symbol name, tag count, branch count and commit count."""
 823     if excludes is None:
 824       excludes = { }
 825     mismatches = [ ]
 826     for branch in self.branches.keys():
 827       if not excludes.has_key(branch) and self.tags.has_key(branch):
 828         mismatches.append((branch,                    # name
 829                            self.tags[branch],         # tag count
 830                            self.branches[branch][0],  # branch count
 831                            self.branches[branch][1])) # commit count
 832     return mismatches
 833
 834   def read(self):
 835     """Read the symbol database from files."""
 836     f = open(TAGS_LIST)
 837     while 1:
 838       line = f.readline()
 839       if not line:
 840         break
 841       tag, count = line.split()
 842       self.tags[tag] = int(count)
 843
 844     f = open(BRANCHES_LIST)
 845     while 1:
 846       line = f.readline()
 847       if not line:
 848         break
 849       words = line.split()
 850       self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
 851       for blocker in words[3:]:
 852         self.branches[words[0]][2][blocker] = None
 853
 854   def write(self):
 855     """Store the symbol database to files."""
 856     f = open(TAGS_LIST, "w")
 857     for tag, count in self.tags.items():
 858       f.write("%s %d\n" % (tag, count))
 859
 860     f = open(BRANCHES_LIST, "w")
 861     for branch, info in self.branches.items():
 862       f.write("%s %d %d" % (branch, info[0], info[1]))
 863       if info[2]:
 864         f.write(" ")
 865         f.write(" ".join(info[2].keys()))
 866       f.write("\n")
 867
 868 class CollectData(cvs2svn_rcsparse.Sink):
 869   def __init__(self):
 870     self.cvsroot = Ctx().cvsroot
 871     self.revs = open(DATAFILE + REVS_SUFFIX, 'w')
 872     Cleanup().register(DATAFILE + REVS_SUFFIX, pass2)
 873     self.resync = open(DATAFILE + RESYNC_SUFFIX, 'w')
 874     Cleanup().register(DATAFILE + RESYNC_SUFFIX, pass2)
 875     self.default_branches_db = Database(DEFAULT_BRANCHES_DB, DB_OPEN_NEW)
 876     Cleanup().register(DEFAULT_BRANCHES_DB, pass5)
 877     self.metadata_db = Database(METADATA_DB, DB_OPEN_NEW)
 878     Cleanup().register(METADATA_DB, pass8)
 879     self.fatal_errors = []
 880     self.num_files = 0
 881     self.symbol_db = SymbolDatabase()
 882
 883     # 1 if we've collected data for at least one file, None otherwise.
 884     self.found_valid_file = None
 885
 886     # See set_fname() for initializations of other variables.
 887
 888   def set_fname(self, canonical_name, filename):
 889     """Prepare to receive data for FILENAME.  FILENAME is the absolute
 890     filesystem path to the file in question, and CANONICAL_NAME is
 891     FILENAME with the 'Attic' component removed (if the file is indeed
 892     in the Attic) ."""
 893     self.fname = canonical_name
 894
 895     # We calculate and save some file metadata here, where we can do
 896     # it only once per file, instead of waiting until later where we
 897     # would have to do the same calculations once per CVS *revision*.
 898
 899     # If the paths are not the same, then that means that the
 900     # canonical_name has had the 'Attic' component stripped out.
 901     self.file_in_attic = None
 902     if not canonical_name == filename:
 903       self.file_in_attic = 1
 904
 905     file_stat = os.stat(filename)
 906     # The size of our file in bytes
 907     self.file_size = file_stat[stat.ST_SIZE]
 908
 909     # Whether or not the executable bit is set.
 910     self.file_executable = None
 911     if file_stat[0] & stat.S_IXUSR:
 912       self.file_executable = 1
 913
 914     # revision -> [timestamp, author, old-timestamp]
 915     self.rev_data = { }
 916
 917     # Maps revision number (key) to the revision number of the
 918     # previous revision along this line of development.
 919     #
 920     # For the first revision R on a branch, we consider the revision
 921     # from which R sprouted to be the 'previous'.
 922     #
 923     # Note that this revision can't be determined arithmetically (due
 924     # to cvsadmin -o, which is why this is necessary).
 925     self.prev_rev = { }
 926
 927     # This dict is essentially self.prev_rev with the values mapped in
 928     # the other direction, so following key -> value will yield you
 929     # the next revision number
 930     self.next_rev = { }
 931
 932     # Track the state of each revision so that in set_revision_info,
 933     # we can determine if our op is an add/change/delete.  We can do
 934     # this because in set_revision_info, we'll have all of the
 935     # revisions for a file at our fingertips, and we need to examine
 936     # the state of our prev_rev to determine if we're an add or a
 937     # change--without the state of the prev_rev, we are unable to
 938     # distinguish between an add and a change.
 939     self.rev_state = { }
 940
 941     # Hash mapping branch numbers, like '1.7.2', to branch names,
 942     # like 'Release_1_0_dev'.
 943     self.branch_names = { }
 944
 945     # RCS flags (used for keyword expansion).
 946     self.mode = None
 947
 948     # Hash mapping revision numbers, like '1.7', to lists of names
 949     # indicating which branches sprout from that revision, like
 950     # ['Release_1_0_dev', 'experimental_driver', ...].
 951     self.branchlist = { }
 952
 953     # Like self.branchlist, but the values are lists of tag names that
 954     # apply to the key revision.
 955     self.taglist = { }
 956
 957     # If set, this is an RCS branch number -- rcsparse calls this the
 958     # "principal branch", but CVS and RCS refer to it as the "default
 959     # branch", so that's what we call it, even though the rcsparse API
 960     # setter method is still 'set_principal_branch'.
 961     self.default_branch = None
 962
 963     # If the RCS file doesn't have a default branch anymore, but does
 964     # have vendor revisions, then we make an educated guess that those
 965     # revisions *were* the head of the default branch up until the
 966     # commit of 1.2, at which point the file's default branch became
 967     # trunk.  This records the date at which 1.2 was committed.
 968     self.first_non_vendor_revision_date = None
 969
 970   def set_principal_branch(self, branch):
 971     self.default_branch = branch
 972
 973   def set_expansion(self, mode):
 974     self.mode = mode
 975
 976   def set_branch_name(self, branch_number, name):
 977     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 978     and that NAME sprouts from BRANCH_NUMBER .
 979     BRANCH_NUMBER is an RCS branch number with an odd number of components,
 980     for example '1.7.2' (never '1.7.0.2')."""
 981     if not self.branch_names.has_key(branch_number):
 982       self.branch_names[branch_number] = name
 983       # The branchlist is keyed on the revision number from which the
 984       # branch sprouts, so strip off the odd final component.
 985       sprout_rev = branch_number[:branch_number.rfind(".")]
 986       if not self.branchlist.has_key(sprout_rev):
 987         self.branchlist[sprout_rev] = []
 988       self.branchlist[sprout_rev].append(name)
 989       self.symbol_db.register_branch_creation(name)
 990     else:
 991       sys.stderr.write("%s: in '%s':\n"
 992                        "   branch '%s' already has name '%s',\n"
 993                        "   cannot also have name '%s', ignoring the latter\n"
 994                        % (warning_prefix, self.fname, branch_number,
 995                           self.branch_names[branch_number], name))
 996
 997   def rev_to_branch_name(self, revision):
 998     """Return the name of the branch on which REVISION lies.
 999     REVISION is a non-branch revision number with an even number of,
1000     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1001     For the convenience of callers, REVISION can also be a trunk
1002     revision such as '1.2', in which case just return None."""
1003     if trunk_rev.match(revision):
1004       return None
1005     return self.branch_names.get(revision[:revision.rindex(".")])
1006
1007   def add_cvs_branch(self, revision, branch_name):
1008     """Record the root revision and branch revision for BRANCH_NAME,
1009     based on REVISION.  REVISION is a CVS branch number having an even
1010     number of components where the second-to-last is '0'.  For
1011     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1012     from 1.7 and has branch number 1.7.2."""
1013     last_dot = revision.rfind(".")
1014     branch_rev = revision[:last_dot]
1015     last2_dot = branch_rev.rfind(".")
1016     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1017     self.set_branch_name(branch_rev, branch_name)
1018
1019   def define_tag(self, name, revision):
1020     """Record a bidirectional mapping between symbolic NAME and REVISION.
1021     REVISION is an unprocessed revision number from the RCS file's
1022     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1023     This function will determine what kind of symbolic name it is by
1024     inspection, and record it in the right places."""
1025     if not symbolic_name_re.match(name):
1026       sys.stderr.write("%s: in '%s':\n"
1027                        "   '%s' is not a valid tag or branch name, ignoring\n"
1028                        % (warning_prefix, self.fname, name))
1029       return
1030
1031     if branch_tag.match(revision):
1032       self.add_cvs_branch(revision, name)
1033     elif vendor_tag.match(revision):
1034       self.set_branch_name(revision, name)
1035     else:
1036       if not self.taglist.has_key(revision):
1037         self.taglist[revision] = []
1038       self.taglist[revision].append(name)
1039       self.symbol_db.register_tag_creation(name)
1040
1041   def define_revision(self, revision, timestamp, author, state,
1042                       branches, next):
1043
1044     # Record the state of our revision for later calculations
1045     self.rev_state[revision] = state
1046
1047     # store the rev_data as a list in case we have to jigger the timestamp
1048     self.rev_data[revision] = [int(timestamp), author, None]
1049
1050     # When on trunk, the RCS 'next' revision number points to what
1051     # humans might consider to be the 'previous' revision number.  For
1052     # example, 1.3's RCS 'next' is 1.2.
1053     #
1054     # However, on a branch, the RCS 'next' revision number really does
1055     # point to what humans would consider to be the 'next' revision
1056     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1057     #
1058     # In other words, in RCS, 'next' always means "where to find the next
1059     # deltatext that you need this revision to retrieve.
1060     #
1061     # That said, we don't *want* RCS's behavior here, so we determine
1062     # whether we're on trunk or a branch and set self.prev_rev
1063     # accordingly.
1064     #
1065     # One last thing.  Note that if REVISION is a branch revision,
1066     # instead of mapping REVISION to NEXT, we instead map NEXT to
1067     # REVISION.  Since we loop over all revisions in the file before
1068     # doing anything with the data we gather here, this 'reverse
1069     # assignment' effectively does the following:
1070     #
1071     # 1. Gives us no 'prev' value for REVISION (in this
1072     # iteration... it may have been set in a previous iteration)
1073     #
1074     # 2. Sets the 'prev' value for the revision with number NEXT to
1075     # REVISION.  So when we come around to the branch revision whose
1076     # revision value is NEXT, its 'prev' and 'prev_rev' are already
1077     # set.
1078     if trunk_rev.match(revision):
1079       self.prev_rev[revision] = next
1080       self.next_rev[next] = revision
1081     elif next:
1082       self.prev_rev[next] = revision
1083       self.next_rev[revision] = next
1084
1085     for b in branches:
1086       self.prev_rev[b] = revision
1087
1088     # Ratchet up the highest vendor head revision, if necessary.
1089     if self.default_branch:
1090       if revision.find(self.default_branch + ".") == 0:
1091         # This revision is on the default branch, so record that it is
1092         # the new highest vendor head revision.
1093         rel_name = relative_name(self.cvsroot, self.fname)[:-2]
1094         self.default_branches_db[rel_name] = revision
1095     else:
1096       # No default branch, so make an educated guess.
1097       if revision == '1.2':
1098         # This is probably the time when the file stopped having a
1099         # default branch, so make a note of it.
1100         self.first_non_vendor_revision_date = timestamp
1101       else:
1102         m = vendor_revision.match(revision)
1103         if m and ((not self.first_non_vendor_revision_date)
1104                   or (timestamp < self.first_non_vendor_revision_date)):
1105           # We're looking at a vendor revision, and it wasn't
1106           # committed after this file lost its default branch, so bump
1107           # the maximum trunk vendor revision in the permanent record.
1108           rel_name = relative_name(self.cvsroot, self.fname)[:-2]
1109           self.default_branches_db[rel_name] = revision
1110
1111     if not trunk_rev.match(revision):
1112       # Check for unlabeled branches, record them.  We tried to collect
1113       # all branch names when we parsed the symbolic name header
1114       # earlier, of course, but that didn't catch unlabeled branches.
1115       # If a branch is unlabeled, this is our first encounter with it,
1116       # so we have to record its data now.
1117       branch_number = revision[:revision.rindex(".")]
1118       if not self.branch_names.has_key(branch_number):
1119         branch_name = "unlabeled-" + branch_number
1120         self.set_branch_name(branch_number, branch_name)
1121
1122       # Register the commit on this non-trunk branch
1123       branch_name = self.branch_names[branch_number]
1124       self.symbol_db.register_branch_commit(branch_name)
1125
1126   def tree_completed(self):
1127     "The revision tree has been parsed.  Analyze it for consistency."
1128
1129     # Our algorithm depends upon the timestamps on the revisions occuring
1130     # monotonically over time.  That is, we want to see rev 1.34 occur in
1131     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
1132     # sorting), and then tried to insert 1.34, we'd be screwed.
1133
1134     # to perform the analysis, we'll simply visit all of the 'previous'
1135     # links that we have recorded and validate that the timestamp on the
1136     # previous revision is before the specified revision
1137
1138     # if we have to resync some nodes, then we restart the scan. just keep
1139     # looping as long as we need to restart.
1140     while 1:
1141       for current, prev in self.prev_rev.items():
1142         if not prev:
1143           # no previous revision exists (i.e. the initial revision)
1144           continue
1145         t_c = self.rev_data[current][0]
1146         t_p = self.rev_data[prev][0]
1147         if t_p >= t_c:
1148           # the previous revision occurred later than the current revision.
1149           # shove the previous revision back in time (and any before it that
1150           # may need to shift).
1151           while t_p >= t_c:
1152             self.rev_data[prev][0] = t_c - 1    # new timestamp
1153             self.rev_data[prev][2] = t_p        # old timestamp
1154
1155             msg =  "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1156                   % (relative_name(self.cvsroot, self.fname),
1157                      prev, time.ctime(t_p), t_c - 1 - t_p)
1158             Log().write(LOG_VERBOSE, msg)
1159
1160             current = prev
1161             prev = self.prev_rev[current]
1162             if not prev:
1163               break
1164             t_c = t_c - 1               # self.rev_data[current][0]
1165             t_p = self.rev_data[prev][0]
1166
1167           # break from the for-loop
1168           break
1169       else:
1170         # finished the for-loop (no resyncing was performed)
1171         return
1172
1173   def set_revision_info(self, revision, log, text):
1174     timestamp, author, old_ts = self.rev_data[revision]
1175     digest = sha.new(log + '\0' + author).hexdigest()
1176     if old_ts:
1177       # the timestamp on this revision was changed. log it for later
1178       # resynchronization of other files's revisions that occurred
1179       # for this time and log message.
1180       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
1181
1182     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1183     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1184     #
1185     # If revision 1.1 appears to have been created via 'cvs add'
1186     # instead of 'cvs import', then this file probably never had a
1187     # default branch, so retroactively remove its record in the
1188     # default branches db.  The test is that the log message CVS uses
1189     # for 1.1 in imports is "Initial revision\n" with no period.
1190     if revision == '1.1' and log != 'Initial revision\n':
1191       rel_name = relative_name(self.cvsroot, self.fname)[:-2]
1192       if self.default_branches_db.has_key(rel_name):
1193         del self.default_branches_db[rel_name]
1194
1195     # How to tell if a CVSRevision is an add, a change, or a deletion:
1196     #
1197     # It's a delete if RCS state is 'dead'
1198     #
1199     # It's an add if RCS state is 'Exp.' and
1200     #      - we either have no previous revision
1201     #        or
1202     #      - we have a previous revision whose state is 'dead'
1203     #
1204     # Anything else is a change.
1205     if self.rev_state[revision] == 'dead':
1206       op = OP_DELETE
1207     elif ((self.prev_rev.get(revision, None) is None)
1208           or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1209       op = OP_ADD
1210     else:
1211       op = OP_CHANGE
1212
1213     if text:
1214       deltatext_code = DELTATEXT_NONEMPTY
1215     else:
1216       deltatext_code = DELTATEXT_EMPTY
1217
1218     c_rev = CVSRevision(Ctx(), timestamp, digest, op,
1219                         self.prev_rev[revision], revision,
1220                         self.next_rev.get(revision),
1221                         self.file_in_attic, self.file_executable,
1222                         self.file_size,
1223                         deltatext_code, self.fname,
1224                         self.mode, self.rev_to_branch_name(revision),
1225                         self.taglist.get(revision, []),
1226                         self.branchlist.get(revision, []))
1227     self.revs.write(str(c_rev) + "\n")
1228
1229     if not self.metadata_db.has_key(digest):
1230       self.metadata_db[digest] = (author, log)
1231
1232   def parse_completed(self):
1233     # Walk through all branches and tags and register them with
1234     # their parent branch in the symbol database.
1235     for revision, symbols in self.taglist.items() + self.branchlist.items():
1236       for symbol in symbols:
1237         name = self.rev_to_branch_name(revision)
1238         if name is not None:
1239           self.symbol_db.register_branch_blocker(name, symbol)
1240
1241     self.num_files = self.num_files + 1
1242
1243   def write_symbol_db(self):
1244     self.symbol_db.write()
1245
class SymbolingsLogger:
  """Manage the file that contains lines for symbol openings and
  closings.

  This data will later be used to determine valid SVNRevision ranges
  from which a file can be copied when creating a branch or tag in
  Subversion.  Do this by finding "Openings" and "Closings" for each
  file copied onto a branch or tag.

  An "Opening" is the CVSRevision from which a given branch/tag
  sprouts on a path.

  The "Closing" for that branch/tag and path is the next CVSRevision
  on the same line of development as the opening.

  For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
  obviously sprouts from revision 1.2.  Therefore, 1.2 is the opening
  for BEE on path 'foo.c', and 1.3 is the closing for BEE on path
  'foo.c'.  Note that there may be many revisions chronologically
  between 1.2 and 1.3, for example, revisions on branches of 'foo.c',
  perhaps even including on branch BEE itself.  But 1.3 is the next
  revision *on the same line* as 1.2, that is why it is the closing
  revision for those symbolic names of which 1.2 is the opening.

  The reason for doing all this hullabaloo is to make branch and tag
  creation as efficient as possible by minimizing the number of copies
  and deletes per creation.  For example, revisions 1.2 and 1.3 of
  foo.c might correspond to revisions 17 and 30 in Subversion.  That
  means that when creating branch BEE, there is some motivation to do
  the copy from one of 17-30.  Now if there were another file,
  'bar.c', whose opening and closing CVSRevisions for BEE corresponded
  to revisions 24 and 39 in Subversion, we would know that the ideal
  thing would be to copy the branch from somewhere between 24 and 29,
  inclusive.
  """
  def __init__(self):
    """Open the symbolings file and the temporary closings file for
    writing, registering both with Cleanup()."""
    self.symbolings = open(SYMBOL_OPENINGS_CLOSINGS, 'w')
    Cleanup().register(SYMBOL_OPENINGS_CLOSINGS, pass6)
    self.closings = open(SYMBOL_CLOSINGS_TMP, 'w')
    Cleanup().register(SYMBOL_CLOSINGS_TMP, pass5)

    # The keys of this dictionary are Subversion repository *source*
    # paths for which we've encountered an 'opening'.  The values are
    # lists of the symbolic names that this path has opened.  The only
    # paths that should be in this dict are paths whose corresponding
    # CVSRevision is a default branch revision.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log any openings found in C_REV, and if C_REV.next_rev is not
    None, a closing.  The opening uses SVN_REVNUM, but the closing (if
    any) will have its revnum determined later.

    No opening is logged when C_REV.op is OP_DELETE."""
    for name in c_rev.tags + c_rev.branches:
      name = _clean_symbolic_name(name)
      self._note_default_branch_opening(c_rev, name)
      if c_rev.op != OP_DELETE:
        self._log(name, svn_revnum, c_rev.svn_path, OPENING)

      # If our c_rev has a next_rev, then that's the closing rev for
      # this source revision.  Log it to closings for later processing
      # since we don't know the svn_revnum yet.
      if c_rev.next_rev is not None:
        self.closings.write('%s %s\n' %
                            (name, c_rev.unique_key(c_rev.next_rev)))

  def _log(self, name, svn_revnum, svn_path, type):
    """Write out a single line to the symbol_openings_closings file
    representing that svn_revnum of svn_path is either the opening or
    closing (TYPE) of NAME (a symbolic name).

    TYPE should only be one of the following global constants:
    OPENING or CLOSING."""
    # 8 places gives us 99,999,999 SVN revs.  That *should* be enough.
    # (Larger numbers are still written in full, just not zero-padded.)
    self.symbolings.write('%s %.8d %s %s\n' % (name, svn_revnum,
                                               type, svn_path))

  def close(self):
    """Iterate through the closings file, lookup the svn_revnum for
    each closing CVSRevision, and write a proper line out to the
    symbolings file.  Both files are closed by this method."""
    # Use this to get the c_rev.svn_path of our rev_key
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    closings_input = fileinput.FileInput(SYMBOL_CLOSINGS_TMP)
    try:
      for line in closings_input:
        (name, rev_key) = line.rstrip().split(" ", 1)
        svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

        c_rev = cvs_revs_db.get_revision(rev_key)
        self._log(name, svn_revnum, c_rev.svn_path, CLOSING)
    finally:
      # Release the input file even if a lookup above fails.
      closings_input.close()

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Record SYMBOLIC_NAME as opened on C_REV.svn_trunk_path.

    NOTE(review): nothing in this method checks whether C_REV is a
    default branch revision; the name is recorded unconditionally."""
    path = c_rev.svn_trunk_path
    if path not in self.open_paths_with_default_branches:
      self.open_paths_with_default_branches[path] = [ ]
    self.open_paths_with_default_branches[path].append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If self.open_paths_with_default_branches contains
    C_REV.svn_trunk_path, then log each name in
    self.open_paths_with_default_branches[C_REV.svn_trunk_path] as a
    closing with SVN_REVNUM as the closing revision number, and
    forget the path."""
    path = c_rev.svn_trunk_path
    if path in self.open_paths_with_default_branches:
      # log each symbol as a closing
      for name in self.open_paths_with_default_branches[path]:
        self._log(name, svn_revnum, path, CLOSING)
      # Remove them from the openings list as we're done with them.
      del self.open_paths_with_default_branches[path]
1359
1360
1361 class PersistenceManager:
1362   """The PersistenceManager allows us to effectively store SVNCommits
1363   to disk and retrieve them later using only their subversion revision
1364   number as the key.  It also returns the subversion revision number
1365   for a given CVSRevision's unique key.
1366
1367   All information pertinent to each SVNCommit is stored in a series of
1368   on-disk databases so that SVNCommits can be retrieved on-demand.
1369
1370   MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
1371   In 'new' mode, PersistenceManager will initialize a new set of on-disk
1372   databases and be fully-featured.
1373   In 'read' mode, PersistenceManager will open existing on-disk databases
1374   and the set_* methods will be unavailable."""
1375   def __init__(self, mode):
1376     self.mode = mode
1377     if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
1378       raise RuntimeError, "Invalid 'mode' argument to PersistenceManager"
1379     self.svn2cvs_db = Database(SVN_REVNUMS_TO_CVS_REVS, mode)
1380     Cleanup().register(SVN_REVNUMS_TO_CVS_REVS, pass8)
1381     self.cvs2svn_db = Database(CVS_REVS_TO_SVN_REVNUMS, mode)
1382     Cleanup().register(CVS_REVS_TO_SVN_REVNUMS, pass8)
1383     self.svn_commit_names_dates = Database(SVN_COMMIT_NAMES_DATES, mode)
1384     Cleanup().register(SVN_COMMIT_NAMES_DATES, pass8)
1385     self.svn_commit_metadata = Database(METADATA_DB, DB_OPEN_READ)
1386     self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
1387     ###PERF kff Elsewhere there are comments about sucking the tags db
1388     ### into memory.  That seems like a good idea.
1389     if not Ctx().trunk_only:
1390       self.tags_db = TagsDatabase(DB_OPEN_READ)
1391       self.motivating_revnums = Database(MOTIVATING_REVNUMS, mode)
1392       Cleanup().register(MOTIVATING_REVNUMS, pass8)
1393
1394     # "branch_name" -> svn_revnum in which branch was last filled.
1395     # This is used by CVSCommit._pre_commit, to prevent creating a fill
1396     # revision which would have nothing to do.
1397     self.last_filled = {}
1398
1399   def total_revs(self):
1400     """Return the total number of Subversion revisions."""
1401     return self.svn2cvs_db.len()
1402
1403   def get_svn_revnum(self, cvs_rev_unique_key):
1404     """Return the Subversion revision number in which
1405     CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
1406     is no mapping for CVS_REV_UNIQUE_KEY."""
1407     return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))
1408
1409   def get_svn_commit(self, svn_revnum):
1410     """Return an SVNCommit that corresponds to SVN_REVNUM.
1411
1412     If no SVNCommit exists for revnum SVN_REVNUM, then return None.
1413
1414     This method can throw SVNCommitInternalInconsistencyError.
1415     """
1416     svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
1417     c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
1418     if c_rev_keys == None:
1419       return None
1420
1421     digest = None
1422     for key in c_rev_keys:
1423       c_rev = self.cvs_revisions.get_revision(key)
1424       svn_commit.add_revision(c_rev)
1425       # Set the author and log message for this commit by using
1426       # CVSRevision metadata, but only if haven't done so already.
1427       if digest is None:
1428         digest = c_rev.digest
1429         author, log_msg = self.svn_commit_metadata[digest]
1430         svn_commit.set_author(author)
1431         svn_commit.set_log_msg(log_msg)
1432
1433     # If we're doing a trunk-only conversion, we don't need to do any more work.
1434     if Ctx().trunk_only:
1435       return svn_commit
1436
1437     name, date = self._get_name_and_date(svn_revnum)
1438     if name:
1439       svn_commit.set_symbolic_name(name)
1440       svn_commit.set_date(date)
1441       if self.tags_db.has_key(name):
1442         svn_commit.is_tag = 1
1443
1444     motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
1445     if motivating_revnum:
1446       svn_commit.set_motivating_revnum(int(motivating_revnum))
1447       svn_commit.set_date(date)
1448
1449     if len(svn_commit.cvs_revs) and name:
1450       msg = """An SVNCommit cannot have cvs_revisions *and* a
1451       corresponding symbolic name ('%s') to fill.""" % name
1452       raise SVNCommit.SVNCommitInternalInconsistencyError(msg)
1453
1454     return svn_commit
1455
1456   def set_cvs_revs(self, svn_revnum, cvs_revs):
1457     """Record the bidirectional mapping between SVN_REVNUM and
1458     CVS_REVS."""
1459     if self.mode == DB_OPEN_READ:
1460       raise RuntimeError, \
1461           'Write operation attempted on read-only PersistenceManager'
1462     for c_rev in cvs_revs:
1463       Log().write(LOG_VERBOSE, " ", c_rev.unique_key())
1464     self.svn2cvs_db[str(svn_revnum)] = [x.unique_key() for x in cvs_revs]
1465     for c_rev in cvs_revs:
1466       self.cvs2svn_db[c_rev.unique_key()] = svn_revnum
1467
1468   def set_name_and_date(self, svn_revnum, name, date):
1469     """Associate symbolic name NAME and DATE with SVN_REVNUM."""
1470     if self.mode == DB_OPEN_READ:
1471       raise RuntimeError, \
1472           'Write operation attempted on read-only PersistenceManager'
1473     self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
1474     self.last_filled[name] = svn_revnum
1475
1476   def _get_name_and_date(self, svn_revnum):
1477     """Return a tuple containing the symbolic name and date associated
1478     with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
1479     associated with it."""
1480     return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))
1481
1482   def set_motivating_revnum(self, svn_revnum, motivating_revnum):
1483     """Store MOTIVATING_REVNUM as the value of SVN_REVNUM"""
1484     if self.mode == DB_OPEN_READ:
1485       raise RuntimeError, \
1486           'Write operation attempted on read-only PersistenceManager'
1487     self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
1488
1489
1490 class CVSCommit:
1491   """Each instance of this class contains a number of CVS Revisions
1492   that correspond to one or more Subversion Commits.  After all CVS
1493   Revisions are added to the grouping, calling process_revisions will
1494   generate a Subversion Commit (or Commits) for the set of CVS
1495   Revisions in the grouping."""
1496
1497   def __init__(self, digest, author, log):
1498     self.digest = digest
1499     self.author = author
1500     self.log = log
1501
1502     # Symbolic names for which the last source revision has already
1503     # been seen and for which the CVSRevisionAggregator has already
1504     # generated a fill SVNCommit.  See self.process_revisions().
1505     self.done_symbols = [ ]
1506
1507     self.files = { }
1508     # Lists of CVSRevisions
1509     self.changes = [ ]
1510     self.deletes = [ ]
1511
1512     # Start out with a t_min higher than any incoming time T, and a
1513     # t_max lower than any incoming T.  This way the first T will
1514     # push t_min down to T, and t_max up to T, naturally (without any
1515     # special-casing), and successive times will then ratchet them
1516     # outward as appropriate.
1517     self.t_min = 1L<<32
1518     self.t_max = 0
1519
1520     # This will be set to the SVNCommit that occurs in self._commit.
1521     self.motivating_commit = None
1522
1523     # This is a list of all non-primary commits motivated by the main
1524     # commit.  We gather these so that we can set their dates to the
1525     # same date as the primary commit.
1526     self.secondary_commits = [ ]
1527
1528     # State for handling default branches.
1529     #
1530     # Here is a tempting, but ultimately nugatory, bit of logic, which
1531     # I share with you so you may appreciate the less attractive, but
1532     # refreshingly non-nugatory, logic which follows it:
1533     #
1534     # If some of the commits in this txn happened on a non-trunk
1535     # default branch, then those files will have to be copied into
1536     # trunk manually after being changed on the branch (because the
1537     # RCS "default branch" appears as head, i.e., trunk, in practice).
1538     # As long as those copies don't overwrite any trunk paths that
1539     # were also changed in this commit, then we can do the copies in
1540     # the same revision, because they won't cover changes that don't
1541     # appear anywhere/anywhen else.  However, if some of the trunk dst
1542     # paths *did* change in this commit, then immediately copying the
1543     # branch changes would lose those trunk mods forever.  So in this
1544     # case, we need to do at least that copy in its own revision.  And
1545     # for simplicity's sake, if we're creating the new revision for
1546     # even one file, then we just do all such copies together in the
1547     # new revision.
1548     #
1549     # Doesn't that sound nice?
1550     #
1551     # Unfortunately, Subversion doesn't support copies with sources
1552     # in the current txn.  All copies must be based in committed
1553     # revisions.  Therefore, we generate the above-described new
1554     # revision unconditionally.
1555     #
1556     # This is a list of c_revs, and a c_rev is appended for each
1557     # default branch commit that will need to be copied to trunk (or
1558     # deleted from trunk) in some generated revision following the
1559     # "regular" revision.
1560     self.default_branch_cvs_revisions = [ ]
1561
1562   def __cmp__(self, other):
1563     # Commits should be sorted by t_max.  If both self and other have
1564     # the same t_max, break the tie using t_min, and lastly, digest
1565     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
1566             or cmp(self.digest, other.digest))
1567
1568   def has_file(self, fname):
1569     return self.files.has_key(fname)
1570
1571   def revisions(self):
1572     return self.changes + self.deletes
1573
1574   def opens_symbolic_name(self, name):
1575     """Returns true if any CVSRevision in this commit is on a tag or a
1576     branch or is the origin of a tag or branch."""
1577     for c_rev in self.revisions():
1578       if c_rev.opens_symbolic_name(name):
1579         return 1
1580     return 0
1581
1582   def add_revision(self, c_rev):
1583     # Record the time range of this commit.
1584     #
1585     # ### ISSUE: It's possible, though unlikely, that the time range
1586     # of a commit could get gradually expanded to be arbitrarily
1587     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
1588     # problem, and anyway deciding where to break it up would be a
1589     # judgement call.  For now, we just print a warning in commit() if
1590     # this happens.
1591     if c_rev.timestamp < self.t_min:
1592       self.t_min = c_rev.timestamp
1593     if c_rev.timestamp > self.t_max:
1594       self.t_max = c_rev.timestamp
1595
1596     if c_rev.op == OP_DELETE:
1597       self.deletes.append(c_rev)
1598     else:
1599       # OP_CHANGE or OP_ADD
1600       self.changes.append(c_rev)
1601
1602     self.files[c_rev.fname] = 1
1603
1604   def _pre_commit(self):
1605     """Generates any SVNCommits that must exist before the main
1606     commit."""
1607
1608     # There may be multiple c_revs in this commit that would cause
1609     # branch B to be filled, but we only want to fill B once.  On the
1610     # other hand, there might be multiple branches committed on in
1611     # this commit.  Whatever the case, we should count exactly one
1612     # commit per branch, because we only fill a branch once per
1613     # CVSCommit.  This list tracks which branches we've already
1614     # counted.
1615     accounted_for_sym_names = [ ]
1616
1617     def fill_needed(c_rev, pm):
1618       """Return 1 if this is the first commit on a new branch (for
1619       this file) and we need to fill the branch; else return 0
1620       (meaning that some other file's first commit on the branch has
1621       already done the fill for us).
1622
1623       If C_REV.op is OP_ADD, only return 1 if the branch that this
1624       commit is on has no last filled revision.
1625
1626       PM is a PersistenceManager to query.
1627       """
1628
1629       # Different '.' counts indicate that c_rev is now on a different
1630       # line of development (and may need a fill)
1631       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
1632         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
1633         # It should be the case that when we have a file F that
1634         # is added on branch B (thus, F on trunk is in state
1635         # 'dead'), we generate an SVNCommit to fill B iff the branch
1636         # has never been filled before.
1637         #
1638         # If this c_rev.op == OP_ADD, *and* the branch has never
1639         # been filled before, then fill it now.  Otherwise, no need to
1640         # fill it.
1641         if c_rev.op == OP_ADD:
1642           if pm.last_filled.get(c_rev.branch_name, None) is None:
1643             return 1
1644         else:
1645           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
1646             return 1
1647       return 0
1648
1649     for c_rev in self.changes + self.deletes:
1650       # If a commit is on a branch, we must ensure that the branch
1651       # path being committed exists (in HEAD of the Subversion
1652       # repository).  If it doesn't exist, we will need to fill the
1653       # branch.  After the fill, the path on which we're committing
1654       # will exist.
1655       if c_rev.branch_name \
1656           and c_rev.branch_name not in accounted_for_sym_names \
1657           and c_rev.branch_name not in self.done_symbols \
1658           and fill_needed(c_rev, Ctx()._persistence_manager):
1659         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
1660                                % c_rev.branch_name)
1661         svn_commit.set_symbolic_name(c_rev.branch_name)
1662         self.secondary_commits.append(svn_commit)
1663         accounted_for_sym_names.append(c_rev.branch_name)
1664
1665   def _commit(self):
1666     """Generates the primary SVNCommit that corresponds the this
1667     CVSCommit."""
1668     # Generate an SVNCommit unconditionally.  Even if the only change
1669     # in this CVSCommit is a deletion of an already-deleted file (that
1670     # is, a CVS revision in state 'dead' whose predecessor was also in
1671     # state 'dead'), the conversion will still generate a Subversion
1672     # revision containing the log message for the second dead
1673     # revision, because we don't want to lose that information.
1674     svn_commit = SVNCommit("commit")
1675     self.motivating_commit = svn_commit
1676
1677     for c_rev in self.changes:
1678       svn_commit.add_revision(c_rev)
1679       # Only make a change if we need to.  When 1.1.1.1 has an empty
1680       # deltatext, the explanation is almost always that we're looking
1681       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
1682       # such imports, CVS creates an RCS file where 1.1 has the
1683       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
1684       # content as 1.1.  There's no reason to reflect this non-change
1685       # in the repository, so we want to do nothing in this case.  (If
1686       # we were really paranoid, we could make sure 1.1's log message
1687       # is the CVS-generated "Initial revision\n", but I think the
1688       # conditions below are strict enough.)
1689       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
1690               and (c_rev.rev == "1.1.1.1")):
1691         if c_rev.is_default_branch_revision():
1692           self.default_branch_cvs_revisions.append(c_rev)
1693
1694     for c_rev in self.deletes:
1695       # When a file is added on a branch, CVS not only adds the file
1696       # on the branch, but generates a trunk revision (typically
1697       # 1.1) for that file in state 'dead'.  We only want to add
1698       # this revision if the log message is not the standard cvs
1699       # fabricated log message.
1700       if c_rev.prev_rev is None:
1701         # c_rev.branches may be empty if the originating branch
1702         # has been excluded.
1703         if not c_rev.branches:
1704           continue
1705         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
1706                              % (c_rev.filename(),
1707                                 c_rev.branches[0]))
1708         author, log_msg = \
1709             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
1710         if log_msg == cvs_generated_msg:
1711           continue
1712
1713       svn_commit.add_revision(c_rev)
1714       if c_rev.is_default_branch_revision():
1715         self.default_branch_cvs_revisions.append(c_rev)
1716
1717     # There is a slight chance that we didn't actually register any
1718     # CVSRevisions with our SVNCommit (see loop over self.deletes
1719     # above), so if we have no CVSRevisions, we don't flush the
1720     # svn_commit to disk and roll back our revnum.
1721     if len(svn_commit.cvs_revs) > 0:
1722       svn_commit.flush()
1723     else:
1724       # We will not be flushing this SVNCommit, so rollback the
1725       # SVNCommit revision counter.
1726       SVNCommit.revnum = SVNCommit.revnum - 1
1727
1728     if not Ctx().trunk_only:
1729       for c_rev in self.revisions():
1730         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
1731
1732   def _post_commit(self):
1733     """Generates any SVNCommits that we can perform now that _commit
1734     has happened.  That is, handle non-trunk default branches.
1735     Sometimes an RCS file has a non-trunk default branch, so a commit
1736     on that default branch would be visible in a default CVS checkout
1737     of HEAD.  If we don't copy that commit over to Subversion's trunk,
1738     then there will be no Subversion tree which corresponds to that
1739     CVS checkout.  Of course, in order to copy the path over, we may
1740     first need to delete the existing trunk there.  """
1741
1742     # Only generate a commit if we have default branch revs
1743     if len(self.default_branch_cvs_revisions):
1744       # Generate an SVNCommit for all of our default branch c_revs.
1745       svn_commit = SVNCommit("post-commit default branch(es)")
1746       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
1747       for c_rev in self.default_branch_cvs_revisions:
1748         svn_commit.add_revision(c_rev)
1749         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
1750                                                             svn_commit.revnum)
1751       self.secondary_commits.append(svn_commit)
1752
1753   def process_revisions(self, done_symbols):
1754     """Process all the CVSRevisions that this instance has, creating
1755     one or more SVNCommits in the process.  Generate fill SVNCommits
1756     only for symbols not in DONE_SYMBOLS (avoids unnecessary
1757     fills).
1758
1759     Return the primary SVNCommit that corresponds to this CVSCommit.
1760     The returned SVNCommit is the commit that motivated any other
1761     SVNCommits generated in this CVSCommit."""
1762     self.done_symbols = done_symbols
1763     seconds = self.t_max - self.t_min + 1
1764
1765     Log().write(LOG_VERBOSE, '-' * 60)
1766     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
1767     if seconds == 1:
1768       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
1769                   % time.ctime(self.t_max))
1770     else:
1771       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
1772       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
1773                   % (time.ctime(self.t_max), seconds))
1774
1775     if seconds > COMMIT_THRESHOLD + 1:
1776       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
1777                   % (warning_prefix, COMMIT_THRESHOLD))
1778
1779     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
1780       self._commit()
1781       return self.motivating_commit
1782
1783     self._pre_commit()
1784     self._commit()
1785     self._post_commit()
1786
1787     for svn_commit in self.secondary_commits:
1788       svn_commit.set_date(self.motivating_commit.get_date())
1789       svn_commit.flush()
1790
1791     return self.motivating_commit
1792
1793
class SVNCommit:
  """One commit to the Subversion Repository.  An SVNCommit is of one
  of three types:

  1. Commits one or more CVSRevisions (cannot fill a symbolic name).

  2. Creates or fills a symbolic name (cannot commit CVSRevisions).

  3. Updates trunk to reflect the contents of a particular branch
     (this is to handle RCS default branches)."""

  # The revision number to assign to the next new SVNCommit.
  # We start at 2 because SVNRepositoryMirror uses the first commit
  # to create trunk, tags, and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Exception raised if we encounter an impossible state in the
    SVNCommit Databases."""

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Instantiate an SVNCommit.  DESCRIPTION is for debugging only.
    If REVNUM is given, the SVNCommit corresponds to that revision
    number; otherwise it takes the next number from the class-wide
    counter.  If CVS_REVS is given, it must be the exact set of
    CVSRevisions for REVNUM.

    Callers are expected not to pass CVS_REVS without REVNUM (this is
    not checked here).  Passing REVNUM without CVS_REVS is fine;
    revisions can then be added one at a time with add_revision()."""
    self._description = description

    # Revprop metadata for this commit.
    #
    # These initial values are placeholders.  At least the log and the
    # date should be different by the time these are used.
    #
    # They are private because their values should be returned encoded
    # in UTF8, while callers aren't required to set them in UTF8.
    # Accessor methods are therefore used to set them, and
    # self.get_revprops() is used to get them, in dictionary form.
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    self.cvs_revs = cvs_revs or []
    if not revnum:
      # No explicit number: take the next one and advance the counter.
      self.revnum = SVNCommit.revnum
      SVNCommit.revnum = SVNCommit.revnum + 1
    else:
      self.revnum = revnum

    # The symbolic name that is filled in this SVNCommit, if any
    self.symbolic_name = None

    # If this commit is a default branch synchronization, this
    # variable holds the subversion revision number of the *primary*
    # commit where the default branch changes actually happened.  It
    # is None otherwise.
    #
    # Multiple synchronization commits may refer to the same
    # motivating commit revision number, and a single synchronization
    # commit may contain CVSRevisions on multiple different default
    # branches.
    self.motivating_revnum = None

    # is_tag is true only if this commit is a fill of a symbolic name
    # that is a tag, None in all other cases.
    self.is_tag = None

  def set_symbolic_name(self, name):
    "Set self.symbolic_name to the cleaned form of NAME."
    self.symbolic_name = _clean_symbolic_name(name)

  def set_motivating_revnum(self, revnum):
    "Set self.motivating_revnum to REVNUM."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
    This is the only way to set an SVNCommit's author."""
    self._author = author

  def set_log_msg(self, msg):
    """Set this SVNCommit's log message to MSG (a locally-encoded string).
    This is the only way to set an SVNCommit's log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Set this SVNCommit's date to DATE (an integer).
    self.add_revision() raises the date automatically whenever it is
    given a CVSRevision with a later timestamp, so calling this may be
    unnecessary, and a value set here may be replaced by a later call
    to self.add_revision()."""
    self._max_date = date

  def get_date(self):
    """Return this SVNCommit's date as an integer."""
    return self._max_date

  def get_revprops(self):
    """Return the Subversion revprops for this SVNCommit, as a
    dictionary with the keys 'svn:author', 'svn:log' and 'svn:date'.

    The author and log message are converted to UTF8 from the
    encoding configured in Ctx().  If that conversion raises
    UnicodeError, warnings are logged and the unconverted author and
    log message are returned instead."""
    date = format_date(self._max_date)
    try:
      ### FIXME: The 'replace' behavior should be an option, like
      ### --encoding is.
      if self._author is None:
        utf8_author = None
      else:
        utf8_author = unicode(self._author, Ctx().encoding,
                              'replace').encode('utf8')
      utf8_log = unicode(self.get_log_msg(), Ctx().encoding,
                         'replace').encode('utf8')
      return { 'svn:author' : utf8_author,
               'svn:log'    : utf8_log,
               'svn:date'   : date }
    except UnicodeError:
      Log().write(LOG_WARN, '%s: problem encoding author or log message:'
                  % warning_prefix)
      Log().write(LOG_WARN, "  author: '%s'" % self._author)
      Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
      Log().write(LOG_WARN, "  date:   '%s'" % date)
      Log().write(LOG_WARN, "(subversion rev %s)  Related files:" % self.revnum)
      for related in self.cvs_revs:
        Log().write(LOG_WARN, " ", related.fname)

      Log().write(LOG_WARN, "Consider rerunning with (for example)",
                  "'--encoding=latin1'.\n")
      # It's better to fall back to the original (unknown encoding) data
      # than to either 1) quit or 2) record nothing at all.
      return { 'svn:author' : self._author,
               'svn:log'    : self.get_log_msg(),
               'svn:date'   : date }

  def add_revision(self, cvs_rev):
    """Append CVS_REV to this commit's CVSRevisions, and move the
    commit's date forward if CVS_REV's timestamp is later."""
    self.cvs_revs.append(cvs_rev)
    timestamp = cvs_rev.timestamp
    if timestamp > self._max_date:
      self._max_date = timestamp

  def _is_primary_commit(self):
    """Return true if this is a primary SVNCommit, i.e. one with
    neither a symbolic name nor a motivating revnum; false
    otherwise."""
    return (not self.symbolic_name) and (not self.motivating_revnum)

  def flush(self):
    """Log the creation of this commit and hand its data to the
    persistence manager."""
    Log().write(LOG_NORMAL, "Creating Subversion commit %d (%s)"
                % (self.revnum, self._description))
    persistence_manager = Ctx()._persistence_manager
    persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)

    if self.motivating_revnum is not None:
      persistence_manager.set_motivating_revnum(self.revnum,
                                                self.motivating_revnum)

    # Only non-primary commits have their date and/or symbolic_name
    # stored.
    if not self._is_primary_commit():
      persistence_manager.set_name_and_date(self.revnum,
                                            self.symbolic_name,
                                            self._max_date)

  def __str__(self):
    """Return a human-readable description of this SVNCommit.  This
    description is not intended to be machine-parseable (although
    we're not going to stop you if you try!)"""
    lines = ["SVNCommit #: " + str(self.revnum)]
    if self.symbolic_name:
      lines.append("   symbolic name: " + self.symbolic_name)
    else:
      lines.append("   NO symbolic name")
    lines.append("   debug description: " + self._description)
    lines.append("   cvs_revs:")
    for c_rev in self.cvs_revs:
      lines.append("     " + c_rev.unique_key())
    # Every line, including the last one, ends with a newline.
    return "\n".join(lines) + "\n"

  def get_log_msg(self):
    """Return the actual log message for a primary commit, and the
    appropriate manufactured log message for a secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Create a log message for a manufactured commit that fills
    self.symbolic_name.  If self.is_tag is true, the message is
    written as though for a tag, else as though for a branch."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
    # Names of 13 or more characters go on a line of their own.
    if len(self.symbolic_name) >= 13:
      separator = '\n'
    else:
      separator = ' '

    return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
           % (kind, separator, self.symbolic_name)

  def _log_msg_for_default_branch_commit(self):
    """Create a log message for a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    template = ('This commit was generated by cvs2svn to compensate for '
                'changes in r%d,\n'
                'which included commits to RCS files with non-trunk default '
                'branches.\n')
    return template % self.motivating_revnum
2003
class CVSRevisionAggregator:
  """Groups incoming CVSRevisions into CVSCommits, each of which
  represents at least one SVNCommit."""
  def __init__(self):
    self.metadata_db = Database(METADATA_DB, DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(SYMBOL_LAST_CVS_REVS_DB, DB_OPEN_READ)

    # Pending CVSCommits.  A commit is stored under its digest until
    # it is closed to further changes, at which point it is moved to
    # a key made of the digest plus one or more '-' characters.
    self.cvs_commits = {}

    # Keys are the symbols waiting to be closed; the values are
    # unused.
    self.pending_symbols = {}

    # Symbols for which the last CVSRevision that is a source for the
    # symbol has already been encountered.  That is, the final fill
    # for the symbol has been done, and it never needs to be filled
    # again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is
    # kept merely for its date, which is used to date the SVNCommits
    # created in self.attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = Database(DEFAULT_BRANCHES_DB, DB_OPEN_READ)


  def process_revision(self, c_rev):
    """Add C_REV to the pending commit that has its digest (creating
    that commit if necessary), after processing every pending commit
    that C_REV is too late to be part of."""
    # Each incoming revision triggers a scan of the commits
    # accumulated so far, to see whether any of them are ready for
    # processing now.
    ready_queue = [ ]
    for digest_key, pending in self.cvs_commits.items():
      if pending.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        ready_queue.append(pending)
        del self.cvs_commits[digest_key]
      elif pending.has_file(c_rev.fname):
        # The inbound commit is on the same file as a pending commit,
        # so close the pending commit to further changes.  It is not
        # flushed though, as there may be other pending commits dated
        # before this one.
        # ### ISSUE: the has_file() check is not optimal.
        # It does fix the dataloss bug where revisions would get lost
        # if checked in too quickly, but it can also break apart the
        # commits.  The correct fix would require tracking the
        # dependencies between change sets and committing them in
        # proper order.
        #
        # Re-key the commit under a string that is not already a key
        # in the self.cvs_commits dict.
        closed_key = digest_key + '-'
        while self.cvs_commits.has_key(closed_key):
          closed_key = closed_key + '-'
        self.cvs_commits[closed_key] = pending
        del self.cvs_commits[digest_key]

    # Add this item into the set of still-available commits.
    digest = c_rev.digest
    if not self.cvs_commits.has_key(digest):
      author, log = self.metadata_db[digest]
      self.cvs_commits[digest] = CVSCommit(digest, author, log)
    self.cvs_commits[digest].add_revision(c_rev)

    # Whatever is in the ready_queue at this point needs to be
    # processed, because this latest rev couldn't possibly be part of
    # any of it.  Put the commits into time-order first.
    ready_queue.sort()

    # attempt_to_commit_symbols must run for this c_rev even if no
    # commits are ready.
    if not ready_queue:
      self.attempt_to_commit_symbols(ready_queue, c_rev)

    # Drain the queue from the front; each commit is taken off the
    # queue before the symbols are examined.
    while ready_queue:
      ready_commit = ready_queue[0]
      self.latest_primary_svn_commit \
          = ready_commit.process_revisions(self.done_symbols)
      del ready_queue[0]
      self.attempt_to_commit_symbols(ready_queue, c_rev)

  def flush(self):
    """Commit anything left in self.cvs_commits.  Then, unless this
    is a trunk-only conversion, close the SymbolingsLogger."""

    # Pair each commit with its key so that the key is still at hand
    # once the pairs have been sorted.
    ready_queue = [(commit, key) for key, commit in self.cvs_commits.items()]
    ready_queue.sort()

    for cvs_commit, digest_key in ready_queue:
      self.latest_primary_svn_commit = \
        cvs_commit.process_revisions(self.done_symbols)
      del self.cvs_commits[digest_key]
      # The commits still waiting remain in self.cvs_commits, so no
      # separate queue needs to be passed.
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    Generate 1 SVNCommit for each symbol in self.pending_symbols that
    doesn't have an opening CVSRevision in either QUEUED_COMMITS or
    self.cvs_commits.values().

    If C_REV is given (and this is not a trunk-only conversion), then
    first add to self.pending_symbols any symbols that C_REV is the
    last CVSRevision for.
    """
    # Unless this is a trunk-only conversion, fetch the symbolic
    # names that this c_rev is the last *source* CVSRevision for, and
    # add them to those left over from previous passes through the
    # aggregator.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # Collect every symbol that still has a *source* CVSRevision in
    # the pending commit queue (self.cvs_commits) or in
    # QUEUED_COMMITS.
    still_open = {}
    for sym in self.pending_symbols.keys():
      for candidate in self.cvs_commits.values() + queued_commits:
        if candidate.opens_symbolic_name(sym):
          still_open[sym] = None
          break

    # The pending symbols are sorted so that they are always
    # processed in the same order, regardless of the order in which
    # the dict hashing algorithm hands them back.  This makes the
    # tests give the same results on all platforms.
    symbol_names = self.pending_symbols.keys()
    symbol_names.sort()
    for sym in symbol_names:
      if not still_open.has_key(sym):
        # Nothing opens sym any more, so close it.
        closing_commit = SVNCommit("closing tag/branch '%s'" % sym)
        closing_commit.set_symbolic_name(sym)
        closing_commit.set_date(self.latest_primary_svn_commit.get_date())
        closing_commit.flush()
        self.done_symbols.append(sym)
        del self.pending_symbols[sym]
2143
2144
class SymbolingsReader:
  """Front end to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  Given a symbolic name, it locates the opening
  and closing Subversion revision numbers recorded for that name and
  hands them back packaged as a SymbolicNameFillingGuide."""
  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and load the
    whole offsets database into the in-memory dictionary
    self.offsets."""
    self.symbolings = open(SYMBOL_OPENINGS_CLOSINGS_SORTED, 'r')
    # The offsets database is tiny and is consulted and updated a
    # lot, so keep a plain dictionary copy of it instead of going to
    # disk every time.
    offsets_db = Database(SYMBOL_OFFSETS_DB, DB_OPEN_READ)
    self.offsets = { }
    for symbol in offsets_db.db.keys():
      self.offsets[symbol] = offsets_db[symbol]

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Build and return a new SymbolicNameFillingGuide for
    SYMBOLIC_NAME, holding the openings and closings recorded up to
    and including SVN_REVNUM.

    An opening may be read in this fill while its closing lies beyond
    SVN_REVNUM.  Such a closing is not handed to the guide now, and it
    is thrown away when a later fill runs into it.  That is harmless:
    a fill is valid without the closing, and we always fill whatever
    we can as early as we can."""
    symbol_fill = SymbolicNameFillingGuide(symbolic_name)

    # A branch may begin with a file that was added on the branch, in
    # which case nothing was recorded for it: return the empty guide.
    if not self.offsets.has_key(symbolic_name):
      return symbol_fill

    # Position the file at the first unread line for this symbol.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      line_start = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      name, revnum, kind, svn_path = line.split(" ", 3)
      revnum = int(revnum)
      # Stop at the first line that belongs to another symbol or that
      # lies in the future relative to SVN_REVNUM.
      if name != symbolic_name or revnum > svn_revnum:
        break
      symbol_fill.register(svn_path, revnum, kind)

    # If anything was consumed, remember where the first unconsumed
    # line starts so that the next fill resumes from there.
    if not symbol_fill.is_empty():
      self.offsets[symbolic_name] = line_start

    symbol_fill.make_node_tree()
    return symbol_fill
2202
2203
class SymbolicNameFillingGuide:
  """A SymbolicNameFillingGuide is essentially a node tree
  representing the source paths to be copied to fill
  self.name in the current SVNCommit.

  After calling self.register() on a series of openings and closings,
  call self.make_node_tree() to prepare self.node_tree for
  examination.  See the docstring for self.make_node_tree() for
  details on the structure of self.node_tree.

  By walking self.node_tree and calling self.get_best_revnum() on each
  node, the caller can determine what subversion revision number to
  copy the path corresponding to that node from.  self.node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""
  def __init__(self, symbolic_name):
    """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
    prepares it for receiving openings and closings."""
    self.name = symbolic_name

    # Keys under which a leaf node stores its opening and closing
    # revision.  They start with '/', which can never be the first
    # character of a path component, so they cannot collide with the
    # names of child entries.
    self.opening_key = "/o"
    self.closing_key = "/c"

    # A dictionary of SVN_PATHS and SVN_REVNUMS whose format is:
    #
    # { svn_path : { self.opening_key : svn_revnum,
    #                self.closing_key : svn_revnum }
    #                ...}
    self.things = { }

    # The key for the root node of the node tree
    self.root_key = '0'
    # The dictionary that holds our node tree, seeded with the root key.
    self.node_tree = { self.root_key : { } }

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE (a key into
    self.node_tree).

    Returns a tuple (REVNUM, MAX_SCORE): the chosen revision number
    and the score it achieved.

    PREFERRED_REVNUM is passed to self._best_rev and used to
    calculate the best revnum.

    If no usable revision is found, an error is written to stderr and
    the program exits with status 1."""
    # Aggregate openings and closings from the rev tree
    openings = self._list_revnums_for_key(node, self.opening_key)
    closings = self._list_revnums_for_key(node, self.closing_key)

    # Score the lists
    scores = self._score_revisions(self._sum_revnum_counts(openings),
                                   self._sum_revnum_counts(closings))

    revnum, max_score = self._best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # Report the symbol being filled (self.name).
      sys.stderr.write(error_prefix + ": failed to find a revision "
                       + "to copy from when copying %s\n" % self.name)
      sys.exit(1)
    return revnum, max_score

  def _best_rev(self, scores, preferred_rev):
    """Return a tuple (REV, MAX_SCORE) for the revision with the
    highest score from SCORES, a list returned by _score_revisions().
    When the maximum score is shared by multiple revisions, the oldest
    revision is selected, unless PREFERRED_REV is one of the
    possibilities, in which case, it is selected.

    REV is SVN_INVALID_REVNUM if no revision has a positive score and
    PREFERRED_REV does not qualify."""
    max_score = 0
    preferred_rev_score = -1
    rev = SVN_INVALID_REVNUM
    if preferred_rev is None:
      # Comparison order of different types is arbitrary. Do not
      # expect None to compare less than int values below.
      # In Python 2.3 None compares with ints like negative infinity.
      # In Python 2.0 None compares with ints like positive infinity.
      preferred_rev = SVN_INVALID_REVNUM
    for revnum, count in scores:
      # Strictly greater, so the oldest revision wins a tie.
      if count > max_score:
        max_score = count
        rev = revnum
      # A score applies from its revision up to the next one in the
      # list, so the last entry at or before PREFERRED_REV gives the
      # score of PREFERRED_REV itself.
      if revnum <= preferred_rev:
        preferred_rev_score = count
    if preferred_rev_score == max_score:
      rev = preferred_rev
    return rev, max_score

  def _score_revisions(self, openings, closings):
    """Return a list of revisions and scores based on OPENINGS and
    CLOSINGS.  The returned list looks like:

       [(REV1 SCORE1), (REV2 SCORE2), ...]

    where REV2 > REV1.  OPENINGS and CLOSINGS are sorted lists of
    (revnum, count) tuples as returned by _sum_revnum_counts() for the
    self.opening_key and self.closing_key values of some file or
    directory node, or else None.

    Each score indicates that copying the corresponding revision (or
    any following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    which are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If OPENINGS is false, return the empty list."""
    # First look for easy outs.
    if not openings:
      return []

    # Must be able to iterate over closings below.
    if closings is None:
      closings = []

    # Every opening raises the score from its revision onwards, so the
    # initial scores are the running total of the opening counts.
    scores = []
    opening_score_accum = 0
    for opening_rev, opening_score in openings:
      opening_score_accum = opening_score_accum + opening_score
      scores.append((opening_rev, opening_score_accum))

    # Every closing lowers the score from its revision onwards.
    # Entries before SEARCH_START are older than every closing seen so
    # far and need not be looked at again.
    search_start = 0
    for closing_rev, closing_score in closings:
      done_exact_rev = None
      insert_index = None
      insert_score = None
      for j in range(search_start, len(scores)):
        score_rev, score = scores[j]
        if score_rev >= closing_rev:
          if not done_exact_rev:
            if score_rev > closing_rev:
              # There is no entry for CLOSING_REV itself: remember to
              # insert one, based on the score of the preceding entry.
              insert_index = j
              insert_score = scores[j-1][1] - closing_score
            done_exact_rev = 1
          scores[j] = (score_rev, score - closing_score)
        else:
          search_start = j + 1
      if not done_exact_rev:
        # CLOSING_REV is newer than every entry: add it at the end.
        scores.append((closing_rev, scores[-1][1] - closing_score))
      if insert_index is not None:
        scores.insert(insert_index, (closing_rev, insert_score))
    return scores

  def _sum_revnum_counts(self, rev_list):
    """Takes an array of revisions (REV_LIST), for example:

      [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]

    and adds up every occurrence of each revision and returns a sorted
    array of tuples containing (svn_revnum, count):

      [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]
    """
    counts = {}
    for revnum in rev_list: # Add up the scores
      counts[revnum] = counts.get(revnum, 0) + 1
    tallied = list(counts.items())
    tallied.sort()
    return tallied

  def _list_revnums_for_key(self, node, revnum_type_key):
    """Scan self.node_tree and return a list of all the revision
    numbers (including duplicates) contained in REVNUM_TYPE_KEY values
    for all leaf nodes at and under NODE.

    NODE is a key into self.node_tree.  REVNUM_TYPE_KEY should be
    either self.opening_key or self.closing_key."""
    node_contents = self.node_tree[node]

    # If the node has self.opening_key, it must be a leaf node--all
    # leaf nodes have at least an opening key (although they may not
    # have a closing key).  Fetch revnum and return.
    if (node_contents.has_key(self.opening_key) and
        node_contents.has_key(revnum_type_key)):
      return [node_contents[revnum_type_key]]

    revnums = []
    for key, child_key in node_contents.items():
      # Skip the opening/closing entries; only real children recurse.
      if key[0] == '/':
        continue
      revnums.extend(self._list_revnums_for_key(child_key,
                                                revnum_type_key))
    return revnums

  def register(self, svn_path, svn_revnum, type):
    """Collects opening and closing revisions for this
    SymbolicNameFillingGuide.  SVN_PATH is the source path that needs
    to be copied into self.name, and SVN_REVNUM is either the
    first svn revision number that we can copy from (our opening), or
    the last (not inclusive) svn revision number that we can copy from
    (our closing).  TYPE indicates whether this path is an opening or
    a closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    It is not necessary to pass a corresponding closing for every
    opening.
    """
    # Always log an OPENING
    if type == OPENING:
      self.things[svn_path] = {self.opening_key: svn_revnum}
    # Only log a closing if we've already registered the opening for that path.
    elif type == CLOSING and self.things.has_key(svn_path):
      # When we have a non-trunk default branch, we may have multiple
      # closings--only register the first closing we encounter.
      if not self.things[svn_path].has_key(self.closing_key):
        self.things[svn_path][self.closing_key] = svn_revnum

  def make_node_tree(self):
    """Generates the SymbolicNameFillingGuide's node tree from
    self.things.  Each leaf node maps self.opening_key to the earliest
    subversion revision from which this node/path may be copied; and
    optionally map self.closing_key to the subversion revision one
    higher than the last revision from which this node/path may be
    copied.  Intermediate nodes never contain opening or closing
    flags.

    Intermediate nodes map each child's name to the child's key in
    self.node_tree.  A leaf node is the very dictionary stored in
    self.things for its path."""
    for svn_path, open_close in self.things.items():
      parent_key = self.root_key

      # Walk down the path, one node at a time.
      components = svn_path.split('/')
      last_index = len(components) - 1
      for i in range(len(components)):
        component = components[i]
        parent_contents = self.node_tree[parent_key]

        try:
          child_key = parent_contents[component]
        except KeyError:
          # First time we see this component here: create its node.
          child_key = gen_key()
          self.node_tree[child_key] = { }
          parent_contents[component] = child_key

        # If this is the leaf, add the openings and closings.  The
        # leaf is recognised by its position: an earlier component may
        # be equal to (or even the same string object as) the last
        # one, e.g. in 'a/b/a', so comparing the component itself with
        # the last component would mistake a directory for the leaf.
        if i == last_index:
          self.node_tree[child_key] = open_close
        parent_key = child_key
    #print_node_tree(self.node_tree, self.root_key)

  def is_empty(self):
    """Return true if we haven't accumulated any openings or closings,
    false otherwise."""
    return not self.things
2461
2462
class FillSource:
  """One candidate source for the symbol filler in
  SVNRepositoryMirror: a source prefix together with a key, plus the
  score and revision number assigned to it later on."""
  def __init__(self, prefix, key):
    """Remember PREFIX and KEY.  The source starts out unscored."""
    self.prefix = prefix
    self.key = key
    self.score = self.revnum = None

  def set_score(self, score, revnum):
    """Record SCORE and REVNUM for this source."""
    self.score, self.revnum = score, revnum

  def __cmp__(self, other):
    """Order FillSources so that sorting puts the highest score
    first.  Raises TypeError if either side has not been scored."""
    for source in (self, other):
      if source.score is None:
        raise TypeError('Tried to compare unscored FillSource')
    # Arguments are swapped on purpose: that yields descending order.
    return cmp(other.score, self.score)
2484
2485
2486 class SVNRepositoryMirror:
2487   """Mirror a Subversion Repository as it is constructed, one
2488   SVNCommit at a time.  The mirror is skeletal; it does not contain
2489   file contents.  The creation of a dumpfile or Subversion repository
2490   is handled by delegates.  See self.add_delegate method for how to
2491   set delegates.
2492
2493   The structure of the repository is kept in two databases and one
2494   hash.  The revs_db database maps revisions to root node keys, and
2495   the nodes_db database maps node keys to nodes.  A node is a hash
2496   from directory names to keys.  Both the revs_db and the nodes_db are
2497   stored on disk and each access is expensive.
2498
2499   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
2501   hash which is cheap to access.
2502
2503   You must invoke _start_commit between SVNCommits.
2504
2505   *** WARNING *** All path arguments to methods in this class CANNOT
2506       have leading or trailing slashes.
2507   """
2508
2509   class SVNRepositoryMirrorPathExistsError(Exception):
2510     """Exception raised if an attempt is made to add a path to the
2511     repository mirror and that path already exists in the youngest
2512     revision of the repository."""
2513     pass
2514
2515   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
2516     """Exception raised if a CVSRevision is found to have an unexpected
2517     operation (OP) value."""
2518     pass
2519
2520   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
2521     """Exception raised if an empty SymbolicNameFillingGuide is returned
2522     during a fill where the branch in question already exists."""
2523     pass
2524
2525   def __init__(self):
2526     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
2527     self.delegates = [ ]
2528
2529     # This corresponds to the 'revisions' table in a Subversion fs.
2530     self.revs_db = Database(SVN_MIRROR_REVISIONS_DB, DB_OPEN_NEW)
2531     Cleanup().register(SVN_MIRROR_REVISIONS_DB, pass8)
2532
2533     # This corresponds to the 'nodes' table in a Subversion fs.  (We
2534     # don't need a 'representations' or 'strings' table because we
2535     # only track metadata, not file contents.)
2536     self.nodes_db = Database(SVN_MIRROR_NODES_DB, DB_OPEN_NEW)
2537     Cleanup().register(SVN_MIRROR_NODES_DB, pass8)
2538
2539     # Start at revision 0 without a root node.  It will be created
2540     # by _open_writable_root_node.
2541     self.youngest = 0
2542     self.new_root_key = None
2543     self.new_nodes = { }
2544
2545     if not Ctx().trunk_only:
2546       ###PERF IMPT: Suck this into memory.
2547       self.tags_db = TagsDatabase(DB_OPEN_READ)
2548       self.symbolings_reader = SymbolingsReader()
2549
2550   def _initialize_repository(self, date):
2551     """Initialize the repository by creating the directories for
2552     trunk, tags, and branches.  This method should only be called
2553     after all delegates are added to the repository mirror."""
2554     # Make a 'fake' SVNCommit so we can take advantage of the revprops
2555     # magic therein
2556     svn_commit = SVNCommit("Initialization", 1)
2557     svn_commit.set_date(date)
2558     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
2559
2560     self._start_commit(svn_commit)
2561     self._mkdir(Ctx().trunk_base)
2562     if not Ctx().trunk_only:
2563       self._mkdir(Ctx().branches_base)
2564       self._mkdir(Ctx().tags_base)
2565
2566   def _start_commit(self, svn_commit):
2567     """Start a new commit."""
2568     if self.youngest > 0:
2569       self._end_commit()
2570
2571     self.youngest = svn_commit.revnum
2572     self.new_root_key = None
2573     self.new_nodes = { }
2574
2575     self._invoke_delegates('start_commit', svn_commit)
2576
2577   def _end_commit(self):
2578     """Called at the end of each commit.  This method copies the newly
2579     created nodes to the on-disk nodes db."""
2580     if self.new_root_key is None:
2581       # No changes were made in this revision, so we make the root node
2582       # of the new revision be the same as the last one.
2583       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
2584     else:
2585       self.revs_db[str(self.youngest)] = self.new_root_key
2586       # Copy the new nodes to the nodes_db
2587       for key, value in self.new_nodes.items():
2588         self.nodes_db[key] = value
2589
2590   def _get_node(self, key):
2591     """Returns the node contents for KEY which may refer to either
2592     self.nodes_db or self.new_nodes."""
2593     if self.new_nodes.has_key(key):
2594       return self.new_nodes[key]
2595     else:
2596       return self.nodes_db[key]
2597
2598   def _open_readonly_node(self, path, revnum):
2599     """Open a readonly node for PATH at revision REVNUM.  Returns the
2600     node key and node contents if the path exists, else (None, None)."""
2601     # Get the root key
2602     if revnum == self.youngest:
2603       if self.new_root_key is None:
2604         node_key = self.revs_db[str(self.youngest - 1)]
2605       else:
2606         node_key = self.new_root_key
2607     else:
2608       node_key = self.revs_db[str(revnum)]
2609
2610     for component in path.split('/'):
2611       node_contents = self._get_node(node_key)
2612       if not node_contents.has_key(component):
2613         return None
2614       node_key = node_contents[component]
2615
2616     return node_key
2617
2618   def _open_writable_root_node(self):
2619     """Open a writable root node.  The current root node is returned
2620     immeditely if it is already writable.  If not, create a new one by
2621     copying the contents of the root node of the previous version."""
2622     if self.new_root_key is not None:
2623       return self.new_root_key, self.new_nodes[self.new_root_key]
2624
2625     if self.youngest < 2:
2626       new_contents = { }
2627     else:
2628       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
2629     self.new_root_key = gen_key()
2630     self.new_nodes = { self.new_root_key: new_contents }
2631
2632     return self.new_root_key, new_contents
2633
2634   def _open_writable_node(self, svn_path, create):
2635     """Open a writable node for the path SVN_PATH, creating SVN_PATH
2636     and any missing directories if CREATE is True."""
2637     parent_key, parent_contents = self._open_writable_root_node()
2638
2639     # Walk up the path, one node at a time.
2640     path_so_far = None
2641     components = svn_path.split('/')
2642     for i in range(len(components)):
2643       component = components[i]
2644       this_key = this_contents = None
2645       path_so_far = _path_join(path_so_far, component)
2646       if parent_contents.has_key(component):
2647         # The component exists.
2648         this_key = parent_contents[component]
2649         if self.new_nodes.has_key(this_key):
2650           this_contents = self.new_nodes[this_key]
2651         else:
2652           # Suck the node from the nodes_db, but update the key
2653           this_contents = self.nodes_db[this_key]
2654           this_key = gen_key()
2655           self.new_nodes[this_key] = this_contents
2656           parent_contents[component] = this_key
2657       elif create:
2658         # The component does not exists, so we create it.
2659         this_contents = { }
2660         this_key = gen_key()
2661         self.new_nodes[this_key] = this_contents
2662         parent_contents[component] = this_key
2663         if i < len(components) - 1:
2664           self._invoke_delegates('mkdir', path_so_far)
2665       else:
2666         # The component does not exists and we are not instructed to
2667         # create it, so we give up.
2668         return None, None
2669
2670       parent_key = this_key
2671       parent_contents = this_contents
2672
2673     return this_key, this_contents
2674
2675   def _path_exists(self, path):
2676     """If PATH exists in self.youngest of the svn repository mirror,
2677     return true, else return None.
2678
2679     PATH must not start with '/'."""
2680     return self._open_readonly_node(path, self.youngest) is not None
2681
2682   def _fast_delete_path(self, parent_path, parent_contents, component):
2683     """Delete COMPONENT from the parent direcory PARENT_PATH with the
2684     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
2685     in PARENT_CONTENTS."""
2686     if parent_contents.has_key(component):
2687       del parent_contents[component]
2688       self._invoke_delegates('delete_path', _path_join(parent_path, component))
2689
2690   def _delete_path(self, svn_path, should_prune=False):
2691     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
2692     all ancestor directories that are made empty when SVN_PATH is deleted.
2693     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
2694
2695     NOTE: This function does *not* allow you delete top-level entries
2696     (like /trunk, /branches, /tags), nor does it prune upwards beyond
2697     those entries."""
2698     pos = svn_path.rfind('/')
2699     parent_path = svn_path[:pos]
2700     entry = svn_path[pos+1:]
2701     parent_key, parent_contents = self._open_writable_node(parent_path, False)
2702     if parent_key is not None:
2703       self._fast_delete_path(parent_path, parent_contents, entry)
2704       # The following recursion makes pruning an O(n^2) operation in the
2705       # worst case (where n is the depth of SVN_PATH), but the worst case
2706       # is probably rare, and the constant cost is pretty low.  Another
2707       # drawback is that we issue a delete for each path and not just
2708       # a single delete for the topmost directory pruned.
2709       if (should_prune and len(parent_contents) == 0 and
2710           parent_path.find('/') != -1):
2711         self._delete_path(parent_path, True)
2712
  def _mkdir(self, path):
    """Create PATH in the repository mirror at the youngest revision.

    The node for PATH is opened through _open_writable_node with
    creation enabled, and the 'mkdir' delegate method is then invoked
    for PATH itself."""
    self._open_writable_node(path, True)
    self._invoke_delegates('mkdir', path)
2717
  def _change_path(self, cvs_rev):
    """Register a change in self.youngest for the CVS_REV's svn_path
    in the repository mirror.

    The mirror itself is left untouched; CVS_REV is only passed on to
    the 'change_path' delegate method."""
    # We do not have to update the nodes because our mirror is only
    # concerned with the presence or absence of paths, and a file
    # content change does not cause any path changes.
    self._invoke_delegates('change_path', cvs_rev)
2725
  def _add_path(self, cvs_rev):
    """Add the CVS_REV's svn_path to the repository mirror.

    The node for CVS_REV.svn_path is opened through
    _open_writable_node with creation enabled, and CVS_REV is then
    passed on to the 'add_path' delegate method."""
    self._open_writable_node(cvs_rev.svn_path, True)
    self._invoke_delegates('add_path', cvs_rev)
2730
2731   def _copy_path(self, src_path, dest_path, src_revnum):
2732     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
2733     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
2734     parent *must* exist, but DEST_PATH *cannot* exist.
2735
2736     Return the node key and the contents of the new node at DEST_PATH
2737     as a dictionary."""
2738     # get the contents of the node of our src_path
2739     src_key = self._open_readonly_node(src_path, src_revnum)
2740     src_contents = self._get_node(src_key)
2741
2742     # Get the parent path and the base path of the dest_path
2743     pos = dest_path.rindex('/')
2744     dest_parent = dest_path[:pos]
2745     dest_basename = dest_path[pos+1:]
2746     dest_parent_key, dest_parent_contents = \
2747                    self._open_writable_node(dest_parent, False)
2748
2749     if dest_parent_contents.has_key(dest_basename):
2750       msg = "Attempt to add path '%s' to repository mirror " % dest_path
2751       msg = msg + "when it already exists in the mirror."
2752       raise self.SVNRepositoryMirrorPathExistsError, msg
2753
2754     dest_parent_contents[dest_basename] = src_key
2755     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
2756
2757     # Yes sir, src_key and src_contents are also the contents of the
2758     # destination.  This is a cheap copy, remember!  :-)
2759     return src_key, src_contents
2760
2761   def _fill_symbolic_name(self, svn_commit):
2762     """Performs all copies necessary to create as much of the the tag
2763     or branch SVN_COMMIT.symbolic_name as possible given the current
2764     revision of the repository mirror.
2765
2766     The symbolic name is guaranteed to exist in the Subversion
2767     repository by the end of this call, even if there are no paths
2768     under it."""
2769     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
2770       svn_commit.symbolic_name, self.youngest)
2771
2772     # Create the list of sources for the symbolic name.  All source
2773     # prefixes must be direct sources for the destination, i.e. we
2774     # must have 'trunk' and 'branches/my_branch' and not just
2775     # 'branches'.
2776     sources = []
2777     for entry, key in symbol_fill.node_tree[symbol_fill.root_key].items():
2778       if entry == Ctx().trunk_base:
2779         sources.append(FillSource(entry, key))
2780       elif entry == Ctx().branches_base:
2781         for entry2, key2 in symbol_fill.node_tree[key].items():
2782           sources.append(FillSource(entry + '/' + entry2, key2))
2783       else:
2784         raise # Should never happen
2785     if self.tags_db.has_key(svn_commit.symbolic_name):
2786       dest_prefix = _path_join(Ctx().tags_base, svn_commit.symbolic_name)
2787     else:
2788       dest_prefix = _path_join(Ctx().branches_base,
2789                                svn_commit.symbolic_name)
2790
2791     if sources:
2792       dest_key = self._open_writable_node(dest_prefix, False)[0]
2793       self._fill(symbol_fill, dest_prefix, dest_key, sources)
2794     else:
2795       # We can only get here for a branch whose first commit is an add
2796       # (as opposed to a copy).
2797       dest_path = Ctx().branches_base + '/' + symbol_fill.name
2798       if not self._path_exists(dest_path):
2799         # If our symbol_fill was empty, that means that our first
2800         # commit on the branch was to a file added on the branch, and
2801         # that this is our first fill of that branch.
2802         #
2803         # This case is covered by test 16.
2804         #
2805         # ...we create the branch by copying trunk from the our
2806         # current revision number minus 1
2807         source_path = Ctx().trunk_base
2808         entries = self._copy_path(source_path, dest_path,
2809                                   svn_commit.revnum - 1)[1]
2810         # Now since we've just copied trunk to a branch that's
2811         # *supposed* to be empty, we delete any entries in the
2812         # copied directory.
2813         for entry in entries.keys():
2814           del_path = dest_path + '/' + entry
2815           # Delete but don't prune.
2816           self._delete_path(del_path)
2817       else:
2818         msg = "Error filling branch '" + symbol_fill.name + "'.\n"
2819         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
2820         msg = msg + "attempted to create a branch that already exists."
2821         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
2822
2823   def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
2824             path = None, parent_source_prefix = None,
2825             preferred_revnum = None, prune_ok = None):
2826     """Fill the tag or branch at DEST_PREFIX + PATH with items from
2827     SOURCES, and recurse into the child items.
2828
2829     DEST_PREFIX is the prefix of the destination directory, e.g.
2830     '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
2831     FillSource classes that are candidates to be copied to the
2832     destination.  DEST_KEY is the key in self.nodes_db to the
2833     destination, or None if the destination does not yet exist.
2834
2835     PATH is the path relative to DEST_PREFIX.  If PATH is None, we
2836     are at the top level, e.g. '/tags/my_tag'.
2837
2838     PARENT_SOURCE_PREFIX is the source prefix that was used to copy
2839     the parent directory, and PREFERRED_REVNUM is an int which is the
2840     source revision number that the caller (who may have copied KEY's
2841     parent) used to perform its copy.  If PREFERRED_REVNUM is None,
2842     then no revision is preferable to any other (which probably means
2843     that no copies have happened yet).
2844
2845     PRUNE_OK means that a copy has been made in this recursion, and
2846     it's safe to prune directories that are not in
2847     SYMBOL_FILL.node_tree, provided that said directory has a source
2848     prefix of one of the PARENT_SOURCE_PREFIX.
2849
2850     PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
2851     should only be passed in by recursive calls."""
2852     # Calculate scores and revnums for all sources
2853     for source in sources:
2854       src_revnum, score = symbol_fill.get_best_revnum(source.key,
2855                                                       preferred_revnum)
2856       source.set_score(score, src_revnum)
2857
2858     # Sort the sources in descending score order so that we will make
2859     # a eventual copy from the source with the highest score.
2860     sources.sort()
2861     copy_source = sources[0]
2862
2863     src_path = _path_join(copy_source.prefix, path)
2864     dest_path = _path_join(dest_prefix, path)
2865
2866     # Figure out if we shall copy to this destination and delete any
2867     # destination path that is in the way.
2868     do_copy = 0
2869     if dest_key is None:
2870       do_copy = 1
2871     elif prune_ok and (parent_source_prefix != copy_source.prefix or
2872                        copy_source.revnum != preferred_revnum):
2873       # We are about to replace the destination, so we need to remove
2874       # it before we perform the copy.
2875       self._delete_path(dest_path)
2876       do_copy = 1
2877
2878     if do_copy:
2879       dest_key, dest_entries = self._copy_path(src_path, dest_path,
2880                                                copy_source.revnum)
2881       prune_ok = 1
2882     else:
2883       dest_entries = self._get_node(dest_key)
2884
2885     # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
2886     # elements and the values are lists of FillSource classes where
2887     # this path element exists.
2888     src_entries = {}
2889     for source in sources:
2890       for entry, key in symbol_fill.node_tree[source.key].items():
2891         if entry[0] == '/': # Skip flags
2892           continue
2893         if not src_entries.has_key(entry):
2894           src_entries[entry] = []
2895         src_entries[entry].append(FillSource(source.prefix, key))
2896
2897     if prune_ok:
2898       # Delete the entries in DEST_ENTRIES that are not in src_entries.
2899       delete_list = [ ]
2900       for entry in dest_entries.keys():
2901         if not src_entries.has_key(entry):
2902           delete_list.append(entry)
2903       if delete_list:
2904         if not self.new_nodes.has_key(dest_key):
2905           dest_key, dest_entries = self._open_writable_node(dest_path, True)
2906         # Sort the delete list to get "diffable" dumpfiles.
2907         delete_list.sort()
2908         for entry in delete_list:
2909           self._fast_delete_path(dest_path, dest_entries, entry)
2910
2911     # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
2912     src_keys = src_entries.keys()
2913     src_keys.sort()
2914     for src_key in src_keys:
2915       if dest_entries.has_key(src_key):
2916         next_dest_key = dest_entries[src_key]
2917       else:
2918         next_dest_key = None
2919       self._fill(symbol_fill, dest_prefix, next_dest_key,
2920                  src_entries[src_key], _path_join(path, src_key),
2921                  copy_source.prefix, sources[0].revnum, prune_ok)
2922
2923   def _synchronize_default_branch(self, svn_commit):
2924     """Propagate any changes that happened on a non-trunk default
2925     branch to the trunk of the repository.  See
2926     CVSCommit._post_commit() for details on why this is necessary."""
2927     for cvs_rev in svn_commit.cvs_revs:
2928       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
2929         if self._path_exists(cvs_rev.svn_trunk_path):
2930           # Delete the path on trunk...
2931           self._delete_path(cvs_rev.svn_trunk_path)
2932         # ...and copy over from branch
2933         self._copy_path(cvs_rev.svn_path, cvs_rev.svn_trunk_path,
2934                         svn_commit.motivating_revnum)
2935       elif cvs_rev.op == OP_DELETE:
2936         # delete trunk path
2937         self._delete_path(cvs_rev.svn_trunk_path)
2938       else:
2939         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
2940                % cvs_rev.op)
2941         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
2942
2943   def commit(self, svn_commit):
2944     """Add an SVNCommit to the SVNRepository, incrementing the
2945     Repository revision number, and changing the repository.  Invoke
2946     the delegates' _start_commit() method."""
2947
2948     if svn_commit.revnum == 2:
2949       self._initialize_repository(svn_commit.get_date())
2950
2951     self._start_commit(svn_commit)
2952
2953     if svn_commit.symbolic_name:
2954       Log().write(LOG_VERBOSE, "Filling symbolic name:",
2955                   svn_commit.symbolic_name)
2956       self._fill_symbolic_name(svn_commit)
2957     elif svn_commit.motivating_revnum:
2958       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
2959                   % svn_commit.motivating_revnum)
2960       self._synchronize_default_branch(svn_commit)
2961     else: # This actually commits CVSRevisions
2962       if len(svn_commit.cvs_revs) > 1: plural = "s"
2963       else: plural = ""
2964       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
2965                   % (len(svn_commit.cvs_revs), plural))
2966       for cvs_rev in svn_commit.cvs_revs:
2967         # See comment in CVSCommit._commit() for what this is all
2968         # about.  Note that although asking self._path_exists() is
2969         # somewhat expensive, we only do it if the first two (cheap)
2970         # tests succeed first.
2971         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
2972                 and (cvs_rev.rev == "1.1.1.1")
2973                 and self._path_exists(cvs_rev.svn_path)):
2974           if cvs_rev.op == OP_ADD:
2975             self._add_path(cvs_rev)
2976           elif cvs_rev.op == OP_CHANGE:
2977             self._change_path(cvs_rev)
2978
2979         if cvs_rev.op == OP_DELETE:
2980           self._delete_path(cvs_rev.svn_path, Ctx().prune)
2981
2982   def cleanup(self):
2983     """Callback for the Cleanup.register in self.__init__."""
2984     self.revs_db = None
2985     self.nodes_db = None
2986
2987   def add_delegate(self, delegate):
2988     """Adds DELEGATE to self.delegates.
2989
2990     For every delegate you add, as soon as SVNRepositoryMirror
2991     performs a repository action method, SVNRepositoryMirror will call
2992     the delegate's corresponding repository action method.  Multiple
2993     delegates will be called in the order that they are added.  See
2994     SVNRepositoryMirrorDelegate for more information."""
2995     self.delegates.append(delegate)
2996
2997   def _invoke_delegates(self, method, *args):
2998     """Iterate through each of our delegates, in the order that they
2999     were added, and call the delegate's method named METHOD with the
3000     arguments in ARGS."""
3001     for delegate in self.delegates:
3002       getattr(delegate, method)(*args)
3003
3004   def finish(self):
3005     """Calls the delegate finish method."""
3006     self._end_commit()
3007     self._invoke_delegates('finish')
3008     self.cleanup()
3009
3010
3011
class SVNRepositoryMirrorDelegate:
  """Abstract base for objects that SVNRepositoryMirror notifies of
  each repository action.  Every method below raises
  NotImplementedError, so a subclass must override all of them.

  Each override carries out, in its own way, the Subversion operation
  that the method name implies.  Taking add_path as an example: the
  DumpfileDelegate writes a node record with 'Node-action: add' to a
  Subversion dumpfile, the StdoutDelegate merely reports that the path
  is being added, and the RepositoryDelegate causes the path to be
  added to the Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to begin SVN_COMMIT, an SVNCommit; the
    details are up to the subclass."""
    raise NotImplementedError()

  def mkdir(self, path):
    """Handle the creation of directory PATH (a string); the details
    are up to the subclass."""
    raise NotImplementedError()

  def add_path(self, c_rev):
    """Handle the addition described by C_REV, a CVSRevision; the
    details are up to the subclass."""
    raise NotImplementedError()

  def change_path(self, c_rev):
    """Handle the change described by C_REV, a CVSRevision; the
    details are up to the subclass."""
    raise NotImplementedError()

  def delete_path(self, path):
    """Handle the deletion of PATH (a string); the details are up to
    the subclass."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """Handle the copy of SRC_PATH to DEST_PATH (both strings) as of
    the Subversion revision number SRC_REVNUM (an int); the details
    are up to the subclass."""
    raise NotImplementedError()

  def finish(self):
    """Do whatever cleanup is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3059
3060
class DumpfileDelegate(SVNRepositoryMirrorDelegate):
  """Create a Subversion dumpfile."""

  def __init__(self):
    """Return a new DumpfileDelegate instance, attached to a dumpfile
    named according to Ctx().dumpfile, using Ctx().encoding.

    If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-rev'
    property on files, when they are changed due to a corresponding
    CVS revision.

    If Ctx().mime_mapper is true, then it is a MimeMapper instance, used
    to determine whether or not to set the 'svn:mime-type' property on
    files.

    If Ctx().set_eol_style is true, then set 'svn:eol-style' to 'native'
    for files not marked with the CVS 'kb' flag.  (But see issue #39
    for how this might change.)"""
    self.dumpfile_path = Ctx().dumpfile
    self.set_cvs_revnum_properties = Ctx().cvs_revnums
    self.set_eol_style = Ctx().set_eol_style
    self.mime_mapper = Ctx().mime_mapper
    self.path_encoding = Ctx().encoding

    self.dumpfile = open(self.dumpfile_path, 'wb')
    self._write_dumpfile_header(self.dumpfile)

  def _write_dumpfile_header(self, dumpfile):
    """Write the standard dumpfile header to DUMPFILE, a writable
    file-like object."""
    # Since the CVS repository doesn't have a UUID, and the Subversion
    # repository will be created with one anyway, we don't specify a
    # UUID in the dumpfile.
    dumpfile.write('SVN-fs-dump-format-version: 2\n\n')

  def _utf8_path(self, path):
    """Return a copy of PATH encoded in UTF-8.  PATH is assumed to be
    encoded in self.path_encoding.

    If PATH cannot be decoded, report the problem on stderr and exit
    with status 1."""
    try:
      # Log messages can be converted with the 'replace' strategy,
      # but we can't afford any lossiness here.
      unicode_path = unicode(path, self.path_encoding, 'strict')
      return unicode_path.encode('utf-8')
    except UnicodeError:
      # This is a fatal error, so it goes to stderr with the usual
      # prefix, like the other fatal errors reported by this class.
      sys.stderr.write("%s: Unable to convert a path '%s' to internal "
                       "encoding.\n"
                       "Consider rerunning with (for example) "
                       "'--encoding=latin1'\n" % (error_prefix, path))
      sys.exit(1)

  def start_commit(self, svn_commit):
    """Emit the start of SVN_COMMIT (an SVNCommit)."""

    self.revision = svn_commit.revnum

    # The start of a new commit typically looks like this:
    #
    #   Revision-number: 1
    #   Prop-content-length: 129
    #   Content-length: 129
    #
    #   K 7
    #   svn:log
    #   V 27
    #   Log message for revision 1.
    #   K 10
    #   svn:author
    #   V 7
    #   jrandom
    #   K 8
    #   svn:date
    #   V 27
    #   2003-04-22T22:57:58.132837Z
    #   PROPS-END
    #
    # Notice that the length headers count everything -- not just the
    # length of the data but also the lengths of the lengths, including
    # the 'K ' or 'V ' prefixes.
    #
    # The reason there are both Prop-content-length and Content-length
    # is that the former includes just props, while the latter includes
    # everything.  That's the generic header form for any entity in a
    # dumpfile.  But since revisions only have props, the two lengths
    # are always the same for revisions.

    # Format each property exactly once and measure what was formatted,
    # so the advertised length cannot drift from the emitted bytes.
    # Properties whose value is None are omitted.
    props = svn_commit.get_revprops()
    prop_chunks = [ ]
    for propname in props.keys():
      value = props[propname]
      if value is None:
        continue
      prop_chunks.append('K %d\n'
                         '%s\n'
                         'V %d\n'
                         '%s\n' % (len(propname),
                                   propname,
                                   len(value),
                                   value))

    total_len = 10  # len('PROPS-END\n')
    for chunk in prop_chunks:
      total_len = total_len + len(chunk)

    # Print the revision header and props
    self.dumpfile.write('Revision-number: %d\n'
                        'Prop-content-length: %d\n'
                        'Content-length: %d\n'
                        '\n'
                        % (self.revision, total_len, total_len))

    for chunk in prop_chunks:
      self.dumpfile.write(chunk)

    self.dumpfile.write('PROPS-END\n')
    self.dumpfile.write('\n')

  def mkdir(self, path):
    """Emit the creation of directory PATH."""
    self.dumpfile.write("Node-path: %s\n"
                        "Node-kind: dir\n"
                        "Node-action: add\n"
                        "Prop-content-length: 10\n"
                        "Content-length: 10\n"
                        "\n"
                        "PROPS-END\n"
                        "\n"
                        "\n" % self._utf8_path(path))

  def _add_or_change_path(self, c_rev, op):
    """Emit the addition or change corresponding to C_REV.
    OP is either the constant OP_ADD or OP_CHANGE.

    The file contents are obtained by running 'co' on C_REV's RCS
    file.  Exit with an error if OP is anything else, or if 'co'
    returns a nonzero exit status."""

    # Validate OP before doing anything else, so that a bad call does
    # not leave a 'co' child process behind.
    if op == OP_ADD:
      action = 'add'
    elif op == OP_CHANGE:
      action = 'change'
    else:
      sys.stderr.write("%s: _add_or_change_path() called with bad op ('%s')\n"
                       % (error_prefix, op))
      sys.exit(1)

    # We begin with only a "CVS revision" property.
    if self.set_cvs_revnum_properties:
      prop_contents = 'K 15\ncvs2svn:cvs-rev\nV %d\n%s\n' \
                      % (len(c_rev.rev), c_rev.rev)
    else:
      prop_contents = ''

    # Tack on the executableness, if any.
    if c_rev.file_executable:
      prop_contents = prop_contents + 'K 14\nsvn:executable\nV 1\n*\n'

    # If the file is marked as binary, it gets a default MIME type of
    # "application/octet-stream".  Otherwise, it gets a default EOL
    # style of "native".
    mime_type = None
    eol_style = None
    if c_rev.mode == 'b':
      mime_type = 'application/octet-stream'
    else:
      eol_style = 'native'

    # If using the MIME mapper, possibly override the default MIME
    # type and EOL style.
    if self.mime_mapper:
      mtype = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)
      if mtype:
        mime_type = mtype
        if not mime_type.startswith("text/"):
          eol_style = None

    # Possibly set the svn:mime-type and svn:eol-style properties.
    if mime_type:
      prop_contents = prop_contents + ('K 13\nsvn:mime-type\nV %d\n%s\n' % \
                                       (len(mime_type), mime_type))
    if self.set_eol_style and eol_style:
      prop_contents = prop_contents + 'K 13\nsvn:eol-style\nV 6\nnative\n'

    # Calculate the property length (+10 for "PROPS-END\n")
    props_len = len(prop_contents) + 10

    ### FIXME: We ought to notice the -kb flag set on the RCS file and
    ### use it to set svn:mime-type.  See issue #39.
    pipe_cmd = 'co -q -x,v -p%s %s' \
               % (c_rev.rev, escape_shell_arg(c_rev.rcs_path()))
    pipe = Popen3(pipe_cmd, True)
    pipe.tochild.close()

    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        'Prop-content-length: %d\n'
                        'Text-content-length: '
                        % (self._utf8_path(c_rev.svn_path),
                           action, props_len))

    # Remember where the placeholders start; the real values are only
    # known once the whole text has been streamed through.
    pos = self.dumpfile.tell()

    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    self.dumpfile.write(prop_contents + 'PROPS-END\n')

    # Insert the rev contents, calculating length and checksum as we go.
    checksum = md5.new()
    length = 0
    buf = pipe.fromchild.read(PIPE_READ_SIZE)
    while buf:
      checksum.update(buf)
      length = length + len(buf)
      self.dumpfile.write(buf)
      buf = pipe.fromchild.read(PIPE_READ_SIZE)
    pipe.fromchild.close()
    error_output = pipe.childerr.read()
    exit_status = pipe.wait()
    if exit_status:
      sys.exit("%s: The command '%s' failed with exit status: %s\n"
               "and the following output:\n"
               "%s" % (error_prefix, pipe_cmd, exit_status, error_output))

    # Go back to patch up the length and checksum headers:
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of the property data plus the
    # length of the text data.
    self.dumpfile.write('%16d' % (length + props_len))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability).
    self.dumpfile.write('\n\n')

  def add_path(self, c_rev):
    """Emit the addition corresponding to C_REV, a CVSRevision."""
    self._add_or_change_path(c_rev, OP_ADD)

  def change_path(self, c_rev):
    """Emit the change corresponding to C_REV, a CVSRevision."""
    self._add_or_change_path(c_rev, OP_CHANGE)

  def delete_path(self, path):
    """Emit the deletion of PATH."""
    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: delete\n'
                        '\n' % self._utf8_path(path))

  def copy_path(self, src_path, dest_path, src_revnum):
    """Emit the copying of SRC_PATH at SRC_REVNUM to DEST_PATH."""
    # We don't need to include "Node-kind:" for copies; the loader
    # ignores it anyway and just uses the source kind instead.
    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: add\n'
                        'Node-copyfrom-rev: %d\n'
                        'Node-copyfrom-path: /%s\n'
                        '\n'
                        % (self._utf8_path(dest_path),
                           src_revnum,
                           self._utf8_path(src_path)))

  def finish(self):
    """Perform any cleanup necessary after all revisions have been
    committed."""
    self.dumpfile.close()
3336
3337
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Each commit is written to the dumpfile by the inherited methods and
  then fed to an 'svnadmin load' child process."""
  def __init__(self):
    """Create the target repository unless Ctx().existing_svnrepos is
    set, then start 'svnadmin load' on Ctx().target and send it the
    dumpfile header."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      run_command('%s create %s %s' % (self.svnadmin, Ctx().bdb_txn_nosync
                                       and "--bdb-txn-nosync"
                                       or "", self.target))
    DumpfileDelegate.__init__(self)

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # DumpfileDelegate.__init__ opened the dumpfile write-only and
    # wrote the header into it.  We need a handle we can also read
    # back from, so close that one explicitly before reopening the
    # same path; otherwise its buffered header could be flushed into
    # the freshly truncated file whenever the old handle goes away.
    self.dumpfile.close()
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = Popen3('%s load -q %s' % (self.svnadmin, self.target),
                              True)
    self.loader_pipe.fromchild.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.tochild)
    except IOError:
      self._loader_failed()

  def _loader_failed(self):
    """Report what svnadmin wrote to its stderr and exit with status
    1.  Called when writing to the loader pipe raises IOError."""
    sys.stderr.write("%s: svnadmin failed with the following output while "
                     "loading the dumpfile:\n" % (error_prefix))
    sys.stderr.write(self.loader_pipe.childerr.read())
    sys.exit(1)

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.tochild.write(data)
      except IOError:
        self._loader_failed()

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, first
    load what the dumpfile currently holds into the svn repository.
    Then empty the dumpfile and emit the start of SVN_COMMIT into it."""
    if self._commit_in_progress:
      self._feed_pipe()
    # The same file is reused for every revision: rewind and truncate.
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Loads the last commit into the repository, then waits for
    svnadmin and exits with an error message if it failed."""
    self._feed_pipe()
    self.dumpfile.close()
    # Closing the loader's stdin tells svnadmin that the dump is done.
    self.loader_pipe.tochild.close()
    error_output = self.loader_pipe.childerr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      sys.exit('%s: svnadmin load failed with exit status: %s\n'
               'and the following output:\n'
               '%s' % (error_prefix, exit_status, error_output))
3404
3405
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """A delegate that changes nothing on disk and only reports, through
  Log(), what the SVNRepositoryMirror is doing.  Each message says
  that something is being done, although all this class ever does is
  say so."""
  def __init__(self, total_revs):
    # TOTAL_REVS is the denominator in the "commit N / TOTAL" line.
    self.total_revs = total_revs

  def start_commit(self, svn_commit):
    """Report the Subversion revision number of the commit that is
    being started, after a separator line."""
    log = Log()
    log.write(LOG_VERBOSE, "=" * 60)
    log.write(LOG_NORMAL, "Starting Subversion commit %d / %d" %
              (svn_commit.revnum, self.total_revs))

  def mkdir(self, path):
    """Report that directory PATH is being created."""
    Log().write(LOG_VERBOSE, "  New Directory", path)

  def add_path(self, c_rev):
    """Report that c_rev.svn_path is being 'added'."""
    Log().write(LOG_VERBOSE, "  Adding", c_rev.svn_path)

  def change_path(self, c_rev):
    """Report that c_rev.svn_path is being 'changed'."""
    Log().write(LOG_VERBOSE, "  Changing", c_rev.svn_path)

  def delete_path(self, path):
    """Report that PATH is being 'deleted'."""
    Log().write(LOG_VERBOSE, "  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report, on two lines, that revision SRC_REVNUM of SRC_PATH is
    being 'copied' to DEST_PATH."""
    log = Log()
    log.write(LOG_VERBOSE, "  Copying revision", src_revnum, "of", src_path)
    log.write(LOG_VERBOSE, "                to", dest_path)

  def finish(self):
    """Report that the repository has been created."""
    log = Log()
    log.write(LOG_VERBOSE, "Finished creating Subversion repository.")
    log.write(LOG_QUIET, "Done.")
3448
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: walk Ctx().cvsroot and hand every ',v' file found to
  cvs2svn_rcsparse.parse() together with a CollectData instance, then
  write the symbol database.

  Exit with an error summary if any file failed to parse, or with an
  explanatory message if no ',v' file was found at all."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    # Callback for os.path.walk; BATON is the CollectData instance.
    cd = baton
    attic_len = len(OS_SEP_PLUS_ATTIC)
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-attic_len:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-attic_len], fname), pathname)
      else:
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      try:
        rcs_file = open(pathname, 'rb')
        try:
          cvs2svn_rcsparse.parse(rcs_file, cd)
        finally:
          # Close each file as soon as it has been parsed (or has
          # failed to parse) instead of leaving that to the garbage
          # collector.
          rcs_file.close()
      except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
        # Record the failure and carry on, so that every broken file
        # is listed in the summary below.
        err = "%s: '%s' is not a valid ,v file" \
              % (error_prefix, pathname)
        sys.stderr.write(err + '\n')
        cd.fatal_errors.append(err)
      except:
        Log().write(LOG_WARN, "Exception occurred while parsing %s" % pathname)
        raise

  os.path.walk(Ctx().cvsroot, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if cd.fatal_errors:
    sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
             + "Error summary:\n"
             + "\n".join(cd.fatal_errors)
             + "\nExited due to fatal error(s).")

  if cd.found_valid_file is None:
    sys.exit("\nNo RCS files found in your CVS Repository!\n"
             + "Are you absolutely certain you are pointing cvs2svn\n"
             + "at a CVS repository?\n"
             + "\nExited due to fatal error(s).")

  Log().write(LOG_QUIET, "Done")
3498
3499 def pass2():
3500   "Pass 2: clean up the revision information."
3501
3502   symbol_db = SymbolDatabase()
3503   symbol_db.read()
3504
3505   # Convert the list of regexps to a list of strings
3506   excludes = symbol_db.find_excluded_symbols(Ctx().excludes)
3507
3508   error_detected = 0
3509
3510   Log().write(LOG_QUIET, "Checking for blocked exclusions...")
3511   blocked_excludes = symbol_db.find_blocked_excludes(excludes)
3512   if blocked_excludes:
3513     for branch, blockers in blocked_excludes.items():
3514       sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
3515                        "excluded because the following symbols depend "
3516                        "on it:\n" % (branch))
3517       for blocker in blockers:
3518         sys.stderr.write("    '%s'\n" % (blocker))
3519     sys.stderr.write("\n")
3520     error_detected = 1
3521
3522   Log().write(LOG_QUIET, "Checking for forced tags with commits...")
3523   invalid_forced_tags = [ ]
3524   for forced_tag in Ctx().forced_tags:
3525     if excludes.has_key(forced_tag):
3526       continue
3527     if symbol_db.branch_has_commit(forced_tag):
3528       invalid_forced_tags.append(forced_tag)
3529   if invalid_forced_tags:
3530     sys.stderr.write(error_prefix + ": The following branches cannot be "
3531                      "forced to be tags because they have commits:\n")
3532     for tag in invalid_forced_tags:
3533       sys.stderr.write("    '%s'\n" % (tag))
3534     sys.stderr.write("\n")
3535     error_detected = 1
3536
3537   Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
3538   mismatches = symbol_db.find_mismatches(excludes)
3539   def is_not_forced(mismatch):
3540     name = mismatch[0]
3541     return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
3542   mismatches = filter(is_not_forced, mismatches)
3543   if mismatches:
3544     sys.stderr.write(error_prefix + ": The following symbols are tags "
3545                      "in some files and branches in others.\nUse "
3546                      "--force-tag, --force-branch and/or --exclude to "
3547                      "resolve the symbols.\n")
3548     for name, tag_count, branch_count, commit_count in mismatches:
3549       sys.stderr.write("    '%s' is a tag in %d files, a branch in "
3550                        "%d files and has commits in %d files.\n"
3551                        % (name, tag_count, branch_count, commit_count))
3552     error_detected = 1
3553
3554   # Bail out now if we found errors
3555   if error_detected:
3556     sys.exit(1)
3557
3558   # Create the tags database
3559   tags_db = TagsDatabase(DB_OPEN_NEW)
3560   for tag in symbol_db.tags.keys():
3561     if tag not in Ctx().forced_branches:
3562       tags_db[tag] = None
3563   for tag in Ctx().forced_tags:
3564     tags_db[tag] = None
3565
3566   Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")
3567
3568   # We may have recorded some changes in revisions' timestamp.  We need to
3569   # scan for any other files which may have had the same log message and
3570   # occurred at "the same time" and change their timestamps, too.
3571
3572   # read the resync data file
3573   def read_resync(fname):
3574     "Read the .resync file into memory."
3575
3576     ### note that we assume that we can hold the entire resync file in
3577     ### memory. really large repositories with whacky timestamps could
3578     ### bust this assumption. should that ever happen, then it is possible
3579     ### to split the resync file into pieces and make multiple passes,
3580     ### using each piece.
3581
3582     #
3583     # A digest maps to a sequence of lists which specify a lower and upper
3584     # time bound for matching up the commit.  We keep a sequence of these
3585     # because a number of checkins with the same log message (e.g. an empty
3586     # log message) could need to be remapped.  We also make them a list because
3587     # we will dynamically expand the lower/upper bound as we find commits
3588     # that fall into a particular msg and time range.
3589     #
3590     # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
3591     #
3592     resync = { }
3593
3594     for line in fileinput.FileInput(fname):
3595       t1 = int(line[:8], 16)
3596       digest = line[9:DIGEST_END_IDX]
3597       t2 = int(line[DIGEST_END_IDX+1:], 16)
3598       t1_l = t1 - COMMIT_THRESHOLD/2
3599       t1_u = t1 + COMMIT_THRESHOLD/2
3600       if resync.has_key(digest):
3601         resync[digest].append([t1_l, t1_u, t2])
3602       else:
3603         resync[digest] = [ [t1_l, t1_u, t2] ]
3604
3605     # For each digest, sort the resync items in it in increasing order,
3606     # based on the lower time bound.
3607     digests = resync.keys()
3608     for digest in digests:
3609       (resync[digest]).sort()
3610
3611     return resync
3612
3613   resync = read_resync(DATAFILE + RESYNC_SUFFIX)
3614
3615   output = open(DATAFILE + CLEAN_REVS_SUFFIX, 'w')
3616   Cleanup().register(DATAFILE + CLEAN_REVS_SUFFIX, pass3)
3617
3618   # process the revisions file, looking for items to clean up
3619   for line in fileinput.FileInput(DATAFILE + REVS_SUFFIX):
3620     c_rev = CVSRevision(Ctx(), line[:-1])
3621
3622     # Skip this entire revision if it's on an excluded branch
3623     if excludes.has_key(c_rev.branch_name):
3624       continue
3625
3626     # Remove all references to excluded tags and branches
3627     def not_excluded(symbol, excludes=excludes):
3628       return not excludes.has_key(symbol)
3629     c_rev.branches = filter(not_excluded, c_rev.branches)
3630     c_rev.tags = filter(not_excluded, c_rev.tags)
3631
3632     # Convert all branches that are forced to be tags
3633     for forced_tag in Ctx().forced_tags:
3634       if forced_tag in c_rev.branches:
3635         c_rev.branches.remove(forced_tag)
3636         c_rev.tags.append(forced_tag)
3637
3638     # Convert all tags that are forced to be branches
3639     for forced_branch in Ctx().forced_branches:
3640       if forced_branch in c_rev.tags:
3641         c_rev.tags.remove(forced_branch)
3642         c_rev.branches.append(forced_branch)
3643
3644     if not resync.has_key(c_rev.digest):
3645       output.write(line)
3646       continue
3647
3648     # we have a hit. see if this is "near" any of the resync records we
3649     # have recorded for this digest [of the log message].
3650     for record in resync[c_rev.digest]:
3651       if record[0] <= c_rev.timestamp <= record[1]:
3652         # bingo! remap the time on this (record[2] is the new time).
3653         msg = "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
3654               % (relative_name(Ctx().cvsroot, c_rev.fname),
3655                  c_rev.rev, time.ctime(c_rev.timestamp),
3656                  record[2] - c_rev.timestamp)
3657         Log().write(LOG_VERBOSE, msg)
3658
3659         # adjust the time range. we want the COMMIT_THRESHOLD from the
3660         # bounds of the earlier/latest commit in this group.
3661         record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
3662         record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)
3663
3664         c_rev.timestamp = record[2]
3665         output.write(str(c_rev) + "\n")
3666
3667         # stop looking for hits
3668         break
3669     else:
3670       # the file/rev did not need to have its time changed.
3671       output.write(line)
3672   Log().write(LOG_QUIET, "Done")
3673
def pass3():
  """Sort the cleaned-up revisions file into the sorted revisions file.

  Reads DATAFILE + CLEAN_REVS_SUFFIX, writes DATAFILE + SORTED_REVS_SUFFIX
  via sort_file(), and registers the sorted file with Cleanup() against
  pass5."""
  clean_revs_path = DATAFILE + CLEAN_REVS_SUFFIX
  sorted_revs_path = DATAFILE + SORTED_REVS_SUFFIX

  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  sort_file(clean_revs_path, sorted_revs_path)
  Cleanup().register(sorted_revs_path, pass5)
  Log().write(LOG_QUIET, "Done")
3680
def pass4():
  """Copy every sorted CVS revision from the flat file into a database.

  Each line of DATAFILE + SORTED_REVS_SUFFIX is parsed into a CVSRevision
  and logged to a new CVSRevisionDatabase.  Unless this is a trunk-only
  conversion, each revision is also logged to a new
  LastSymbolicNameDatabase, which (per its name and the original notes)
  holds the last CVSRevision that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)

  if Ctx().trunk_only:
    # Stand-in object whose methods do nothing, so the loop below does
    # not have to test Ctx().trunk_only for every revision.
    class NullSymbolicNameDB:
      def log_revision(*args):
        pass
      def create_database(*args):
        pass
    symbolic_names_db = NullSymbolicNameDB()
  else:
    Log().write(LOG_QUIET,
        "and finding last CVS revisions for all symbolic names...")
    symbolic_names_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(DATAFILE + SORTED_REVS_SUFFIX)
  for rev_line in sorted_revs:
    # Drop the trailing newline before parsing the record.
    revision = CVSRevision(Ctx(), rev_line[:-1])
    cvs_revs_db.log_revision(revision)
    symbolic_names_db.log_revision(revision)

  symbolic_names_db.create_database()
  Log().write(LOG_QUIET, "Done")
3709
def pass5():
  """
  Build the SVNCommit <-> CVSRevision mapping databases by feeding every
  sorted CVS revision to a CVSRevisionAggregator.

  In a trunk-only conversion, revisions that carry a branch name are
  skipped.  According to the original notes, CVSCommit._commit also calls
  SymbolingsLogger to register CVSRevisions that open or close a path on
  a branch or tag; see SymbolingsLogger for details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(DATAFILE + SORTED_REVS_SUFFIX)
  for rev_line in sorted_revs:
    # Drop the trailing newline before parsing the record.
    revision = CVSRevision(Ctx(), rev_line[:-1])
    if Ctx().trunk_only and revision.branch_name is not None:
      # Branch revisions are ignored when converting trunk only.
      continue
    aggregator.process_revision(revision)
  aggregator.flush()

  Log().write(LOG_QUIET, "Done")
3727
def pass6():
  """Sort the symbol openings/closings file.

  Nothing is sorted for a trunk-only conversion.  Otherwise
  SYMBOL_OPENINGS_CLOSINGS is sorted into SYMBOL_OPENINGS_CLOSINGS_SORTED,
  and the sorted file is registered with Cleanup() against pass8."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  if not Ctx().trunk_only:
    unsorted_path = SYMBOL_OPENINGS_CLOSINGS
    sorted_path = SYMBOL_OPENINGS_CLOSINGS_SORTED
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(sorted_path, pass8)

  Log().write(LOG_QUIET, "Done")
3735
def pass7():
  """Record where each symbolic name starts in the sorted symbolings file.

  Does nothing beyond logging for a trunk-only conversion.  Otherwise it
  fills the SYMBOL_OFFSETS_DB database with a SYMBOLIC_NAME -> file offset
  mapping (see generate_offsets_for_symbolings below)."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(SYMBOL_OFFSETS_DB, DB_OPEN_NEW)
    Cleanup().register(SYMBOL_OFFSETS_DB, pass8)

    symbolings = open(SYMBOL_OPENINGS_CLOSINGS_SORTED, 'r')
    try:
      old_sym = ""
      # readline()/tell() are used instead of iterating over the file so
      # that the offset of the start of each line is known exactly.
      while 1:
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is needed here; unpacking all three fields
        # still rejects lines that do not have the expected layout.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          # First line for this symbol: remember where it starts.
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      # Always release the file handle, even if a line is malformed.
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
3769
def pass8():
  """Replay every stored SVN commit into the repository mirror.

  Output goes through delegates attached to an SVNRepositoryMirror: a
  RepositoryDelegate when Ctx().target is set, otherwise a
  DumpfileDelegate (neither is attached on a dry run), plus a
  StdoutDelegate.  Commits are fetched from a PersistenceManager by
  revision number, starting at 2, until none is returned."""
  mirror = SVNRepositoryMirror()
  commits = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    banner = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    banner = "Starting Subversion Dumpfile."

  if not Ctx().dry_run:
    mirror.add_delegate(delegate_class())
  Log().write(LOG_QUIET, banner)

  mirror.add_delegate(StdoutDelegate(commits.total_revs() + 1))

  # Repository initialization is revision 1, so real commits start at 2.
  revnum = 2
  svn_commit = commits.get_svn_commit(revnum)
  while svn_commit:
    mirror.commit(svn_commit)
    revnum = revnum + 1
    svn_commit = commits.get_svn_commit(revnum)

  mirror.finish()
3794
# The conversion passes, in execution order.  Pass number N (1-based, as
# given to the -p option) is _passes[N - 1].
_passes = [pass1, pass2, pass3, pass4, pass5, pass6, pass7, pass8]
3805
3806
class Ctx:
  """Session state for this run of cvs2svn.py, such as the run-time
  options.  This class is a Borg: every instance shares one attribute
  dictionary, so a value set through one instance is seen by all."""
  __shared_state = { }
  def __init__(self):
    self.__dict__ = self.__shared_state
    if self.__dict__:
      # The shared state was already populated by an earlier instance.
      return

    # First instantiation: install the default for every option.
    defaults = {
      # Locations.
      'cvsroot': None,
      'target': None,
      'dumpfile': DUMPFILE,
      'svnadmin': "svnadmin",
      # Repository layout.
      'trunk_base': "trunk",
      'tags_base': "tags",
      'branches_base': "branches",
      # Output verbosity and help.
      'verbose': 0,
      'quiet': 0,
      'print_help': 0,
      # Conversion behaviour switches.
      'prune': 1,
      'existing_svnrepos': 0,
      'dump_only': 0,
      'dry_run': 0,
      'trunk_only': 0,
      'skip_cleanup': 0,
      'cvs_revnums': 0,
      'bdb_txn_nosync': 0,
      # Commit metadata and file properties.
      'encoding': "ascii",
      'username': None,
      'mime_types_file': None,
      'mime_mapper': None,
      'set_eol_style': 0,
      # Symbol handling.
      'forced_branches': [],
      'forced_tags': [],
      'excludes': [],
      }
    self.__dict__.update(defaults)
3842
class MimeMapper:
  "A class that provides mappings from file names to MIME types."

  def __init__(self):
    # File extension (or whole base name, for files without one) -> MIME type.
    self.mappings = { }
    # Extensions that were looked up but had no mapping; used as a set
    # (extension -> 1) and reported by print_missing_mappings().
    self.missing_mappings = { }


  def set_mime_types_file(self, mime_types_file):
    """Load extension -> MIME type mappings from MIME_TYPES_FILE.

    Each useful line looks like 'text/plain c h cpp': a MIME type followed
    by one or more extensions.  Blank lines, lines naming a type without
    any extension, and comment lines (first non-blank character '#') are
    ignored.  If an extension is listed under two different types, a
    warning is written to stderr and the later type wins."""
    # Use a private FileInput object, like the rest of this file, instead
    # of the module-level fileinput.input() and its shared global state.
    lines = fileinput.FileInput(mime_types_file)
    try:
      for line in lines:
        fields = line.split()
        if len(fields) < 2:
          # Blank line, or a type with no extensions.
          continue
        if fields[0].startswith("#"):
          # Comment line.  Checking the first field rather than the raw
          # line also catches comments preceded by whitespace.
          continue

        mime_type = fields[0]
        for ext in fields[1:]:
          known_type = self.mappings.get(ext)
          if known_type is not None and known_type != mime_type:
            sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
                             % (warning_prefix, ext, known_type, mime_type))
          self.mappings[ext] = mime_type
    finally:
      lines.close()


  def get_type_from_filename(self, filename):
    """Return the MIME type mapped to FILENAME, or None if there is none.

    The lookup key is the file's extension without the dot.  A failed
    lookup is remembered in self.missing_mappings."""
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename

    mime_type = self.mappings.get(extension)
    if mime_type is None:
      self.missing_mappings[extension] = 1
    return mime_type


  def print_missing_mappings(self):
    """Write one warning to stderr per extension that had no mapping."""
    # Sort so that the warnings come out in a stable order.
    extensions = list(self.missing_mappings.keys())
    extensions.sort()
    for ext in extensions:
      sys.stderr.write("%s: no MIME mapping for *.%s\n" % (warning_prefix, ext))
3890
3891
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Runs passes START_PASS through END_PASS (1-based, inclusive) from
  _passes, then logs how long each pass and the whole run took.  Exits
  with status 1 if Ctx().cvsroot does not exist."""

  cvsroot_exists = os.path.exists(Ctx().cvsroot)
  if not cvsroot_exists:
    sys.stderr.write(error_prefix + ': \'%s\' does not exist.\n'
                     % Ctx().cvsroot)
    sys.exit(1)

  cleanup = Cleanup()

  # stamps[k] is the start time of pass k + 1; the finish time of the last
  # pass is appended once the loop is over.
  stamps = [ None ] * (end_pass)
  for pass_num in range(start_pass, end_pass + 1):
    slot = pass_num - 1
    stamps[slot] = time.time()
    Log().write(LOG_QUIET, '----- pass %d -----' % pass_num)
    _passes[slot]()

    # Dispose of items in Ctx() not intended to live past the end of the
    # pass (identified by exactly one leading underscore).
    for attr in dir(Ctx()):
      if len(attr) <= 2 or attr[0] != '_' or attr[1] == '_':
        continue
      if attr[:6] == "_Ctx__":
        # Name-mangled private attribute of Ctx itself; keep it.
        continue
      delattr(Ctx(), attr)

    if not Ctx().skip_cleanup:
      cleanup.cleanup(_passes[slot])

  stamps.append(time.time())
  Log().write(LOG_QUIET, '------------------')

  for pass_num in range(start_pass, end_pass + 1):
    elapsed = int(stamps[pass_num] - stamps[pass_num-1])
    Log().write(LOG_QUIET, 'pass %d: %d seconds' % (pass_num, elapsed))

  total_elapsed = int(stamps[-1] - stamps[start_pass-1])
  Log().write(LOG_QUIET, ' total:', total_elapsed, 'seconds')
3922
3923
3924 def usage():
3925   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
3926         % os.path.basename(sys.argv[0])
3927   print '  --help, -h           print this usage message and exit with success'
3928   print '  --version            print the version number'
3929   print '  -q                   quiet'
3930   print '  -v                   verbose'
3931   print '  -s PATH              path for SVN repos'
3932   print '  -p START[:END]       start at pass START, end at pass END of %d' % len(_passes)
3933   print '                       If only START is given, run only pass START'
3934   print '                       (implicitly enables --skip-cleanup)'
3935   print '  --existing-svnrepos  load into existing SVN repository'
3936   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
3937   print '  --dry-run            do not create a repository or a dumpfile;'
3938   print '                       just print what would happen.'
3939   print '  --svnadmin=PATH      path to the svnadmin program'
3940   print '  --trunk-only         convert only trunk commits, not tags nor branches'
3941   print '  --trunk=PATH         path for trunk (default: %s)'    \
3942         % Ctx().trunk_base
3943   print '  --branches=PATH      path for branches (default: %s)' \
3944         % Ctx().branches_base
3945   print '  --tags=PATH          path for tags (default: %s)'     \
3946         % Ctx().tags_base
3947   print '  --no-prune           don\'t prune empty directories'
3948   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
3949   print '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)' \
3950         % Ctx().encoding
3951   print '  --force-branch=NAME  Force NAME to be a branch.'
3952   print '  --force-tag=NAME     Force NAME to be a tag.'
3953   print '  --exclude=REGEXP     Exclude branches and tags matching REGEXP.'
3954   print '  --username=NAME      username for cvs2svn-synthesized commits'
3955   print '  --skip-cleanup       prevent the deletion of intermediate files'
3956   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
3957   print '  --cvs-revnums        record CVS revision numbers as file properties'
3958   print '  --mime-types=FILE    specify an apache-style mime.types file for\n' \
3959         '                       setting svn:mime-type'
3960   print '  --set-eol-style      automatically set svn:eol-style=native for\n' \
3961         '                       text files'
3962
3963
3964 def main():
3965   # Convenience var, so we don't have to keep instantiating this Borg.
3966   ctx = Ctx()
3967
3968   start_pass = 1
3969   end_pass = len(_passes)
3970
3971   try:
3972     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
3973                                [ "help", "create", "trunk=",
3974                                  "username=", "existing-svnrepos",
3975                                  "branches=", "tags=", "encoding=",
3976                                  "force-branch=", "force-tag=", "exclude=",
3977                                  "mime-types=", "set-eol-style",
3978                                  "trunk-only", "no-prune", "dry-run",
3979                                  "dump-only", "dumpfile=", "svnadmin=",
3980                                  "skip-cleanup", "cvs-revnums",
3981                                  "bdb-txn-nosync", "version"])
3982   except getopt.GetoptError, e:
3983     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
3984     usage()
3985     sys.exit(1)
3986
3987   for opt, value in opts:
3988     if opt == '--version':
3989         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
3990         sys.exit(0)
3991     elif opt == '-p':
3992       # Don't cleanup if we're doing incrementals.
3993       ctx.skip_cleanup = 1
3994       if value.find(':') > 0:
3995         start_pass, end_pass = map(int, value.split(':'))
3996       else:
3997         end_pass = start_pass = int(value)
3998       if start_pass > len(_passes) or start_pass < 1:
3999         print '%s: illegal value (%d) for starting pass. '\
4000               'must be 1 through %d.' % (error_prefix, int(start_pass),
4001                                          len(_passes))
4002         sys.exit(1)
4003       if end_pass < start_pass or end_pass > len(_passes):
4004         print '%s: illegal value (%d) for ending pass. ' \
4005               'must be %d through %d.' % (error_prefix, int(end_pass),
4006                                           int(start_pass), len(_passes))
4007         sys.exit(1)
4008     elif (opt == '--help') or (opt == '-h'):
4009       ctx.print_help = 1
4010     elif opt == '-v':
4011       Log().log_level = LOG_VERBOSE
4012       ctx.verbose = 1
4013     elif opt == '-q':
4014       Log().log_level = LOG_QUIET
4015       ctx.quiet = 1
4016     elif opt == '-s':
4017       ctx.target = value
4018     elif opt == '--existing-svnrepos':
4019       ctx.existing_svnrepos = 1
4020     elif opt == '--dumpfile':
4021       ctx.dumpfile = value
4022     elif opt == '--svnadmin':
4023       ctx.svnadmin = value
4024     elif opt == '--trunk-only':
4025       ctx.trunk_only = 1
4026     elif opt == '--trunk':
4027       ctx.trunk_base = value
4028     elif opt == '--branches':
4029       ctx.branches_base = value
4030     elif opt == '--tags':
4031       ctx.tags_base = value
4032     elif opt == '--no-prune':
4033       ctx.prune = None
4034     elif opt == '--dump-only':
4035       ctx.dump_only = 1
4036     elif opt == '--dry-run':
4037       ctx.dry_run = 1
4038     elif opt == '--encoding':
4039       ctx.encoding = value
4040     elif opt == '--force-branch':
4041       ctx.forced_branches.append(value)
4042     elif opt == '--force-tag':
4043       ctx.forced_tags.append(value)
4044     elif opt == '--exclude':
4045       try:
4046         ctx.excludes.append(re.compile('^' + value + '$'))
4047       except re.error, e:
4048         sys.exit(error_prefix + ": '%s' is not a valid regexp.\n" % (value))
4049     elif opt == '--mime-types':
4050       ctx.mime_types_file = value
4051     elif opt == '--set-eol-style':
4052       ctx.set_eol_style = 1
4053     elif opt == '--username':
4054       ctx.username = value
4055     elif opt == '--skip-cleanup':
4056       ctx.skip_cleanup = 1
4057     elif opt == '--cvs-revnums':
4058       ctx.cvs_revnums = 1
4059     elif opt == '--bdb-txn-nosync':
4060       ctx.bdb_txn_nosync = 1
4061     elif opt == '--create':
4062       sys.stderr.write(warning_prefix +
4063           ': The behaviour produced by the --create option is now the '
4064           'default,\nand passing the option is deprecated.\n')
4065
4066   if ctx.print_help:
4067     usage()
4068     sys.exit(0)
4069
4070   # Consistency check for options and arguments.
4071   if len(args) == 0:
4072     usage()
4073     sys.exit(1)
4074
4075   if len(args) > 1:
4076     sys.stderr.write(error_prefix +
4077                      ": must pass only one CVS repository.\n")
4078     usage()
4079     sys.exit(1)
4080
4081   ctx.cvsroot = args[0]
4082
4083   if not os.path.isdir(ctx.cvsroot):
4084     sys.stderr.write(error_prefix +
4085                      ": the cvs-repos-path '%s' is not an "
4086                      "existing directory.\n" % ctx.cvsroot)
4087     sys.exit(1)
4088
4089   if (not ctx.target) and (not ctx.dump_only):
4090     sys.stderr.write(error_prefix +
4091                      ": must pass one of '-s' or '--dump-only'.\n")
4092     sys.exit(1)
4093
4094   def not_both(opt1val, opt1name, opt2val, opt2name):
4095     if opt1val and opt2val:
4096       sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
4097           % (opt1name, opt2name))
4098       sys.exit(1)
4099
4100   not_both(ctx.target, '-s', ctx.dump_only, '--dump-only')
4101
4102   not_both(ctx.dump_only, '--dump-only',
4103     ctx.existing_svnrepos, '--existing-svnrepos')
4104
4105   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
4106     ctx.existing_svnrepos, '--existing-svnrepos')
4107
4108   not_both(ctx.dump_only, '--dump-only',
4109     ctx.bdb_txn_nosync, '--bdb-txn-nosync')
4110
4111   not_both(ctx.quiet, '-q',
4112     ctx.verbose, '-v')
4113
4114   if ((string.find(ctx.trunk_base, '/') > -1)
4115       or (string.find(ctx.tags_base, '/') > -1)
4116       or (string.find(ctx.branches_base, '/') > -1)):
4117     sys.stderr.write("%s: cannot pass multicomponent path to "
4118                      "--trunk, --tags, or --branches yet.\n"
4119                      "  See http://cvs2svn.tigris.org/issues/show_bug.cgi?"
4120                      "id=7 for details.\n" % error_prefix)
4121     sys.exit(1)
4122
4123   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
4124     sys.stderr.write(error_prefix +
4125                      ": the svn-repos-path '%s' is not an "
4126                      "existing directory.\n" % ctx.target)
4127     sys.exit(1)
4128
4129   if not ctx.dump_only and not ctx.existing_svnrepos \
4130       and os.path.exists(ctx.target):
4131     sys.stderr.write(error_prefix +
4132                      ": the svn-repos-path '%s' exists.\nRemove it, or pass "
4133                      "'--existing-svnrepos'.\n" % ctx.target)
4134     sys.exit(1)
4135
4136   if ctx.mime_types_file:
4137     ctx.mime_mapper = MimeMapper()
4138     ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
4139
4140   # Lock the current directory for temporary files.
4141   try:
4142     os.mkdir('cvs2svn.lock')
4143   except OSError, e:
4144     if e.errno == errno.EACCES:
4145       sys.stderr.write(error_prefix + ": Permission denied:"
4146                        + " No write access to output directory.\n")
4147       sys.exit(1)
4148     if e.errno == errno.EEXIST:
4149       sys.stderr.write(error_prefix +
4150           ": cvs2svn writes temporary files to the current working directory.\n"
4151           "  The directory 'cvs2svn.lock' exists, indicating that another\n"
4152           "  cvs2svn process is currently using the current directory for its\n"
4153           "  temporary workspace. If you are certain that is not the case,\n"
4154           "  remove the 'cvs2svn.lock' directory.\n")
4155       sys.exit(1)
4156     raise
4157   try:
4158     convert(start_pass, end_pass)
4159   finally:
4160     try: os.rmdir('cvs2svn.lock')
4161     except: pass
4162
4163   if ctx.mime_types_file:
4164     ctx.mime_mapper.print_missing_mappings()
4165
# Run the converter only when this file is executed as a script, not
# when it is imported as a module.
if "__main__" == __name__:
  main()