1 #!/usr/bin/env python
3 # cvs2svn: ...
5 # $LastChangedRevision$
7 # ====================================================================
8 # Copyright (c) 2000-2004 CollabNet. All rights reserved.
10 # This software is licensed as described in the file COPYING, which
11 # you should have received as part of this distribution. The terms
12 # are also available at http://subversion.tigris.org/license-1.html.
13 # If newer versions of this license are posted there, you may use a
14 # newer version instead, at your option.
16 # This software consists of voluntary contributions made by many
17 # individuals. For exact contribution history, see the revision
18 # history and logs, available at http://cvs2svn.tigris.org/.
19 # ====================================================================
21 import rcsparse
22 import os
23 import sys
24 import sha
25 import re
26 import time
27 import fileinput
28 import string
29 import getopt
30 import stat
31 import string
32 import md5
33 import anydbm
34 import marshal
36 # Warnings and errors start with these strings. They are typically
37 # followed by a colon and a space, as in "%s: " ==> "Warning: ".
38 warning_prefix = "Warning"
39 error_prefix = "Error"
41 # Make sure this Python is recent enough.
42 if sys.hexversion < 0x2000000:
43 sys.stderr.write("%s: Python 2.0 or higher required, "
44 "see www.python.org.\n" % error_prefix)
45 sys.exit(1)
47 # Don't settle for less.
48 if (anydbm._defaultmod.__name__ == 'dumbdbm'
49 or anydbm._defaultmod.__name__ == 'dbm'):
50 print 'ERROR: your installation of Python does not contain a suitable'
51 print ' DBM module. This script cannot continue.'
52 print ' To fix this, see http://python.org/doc/current/lib/module-anydbm.html'
53 print ' for details.'
54 sys.exit(1)
56 if hasattr(anydbm._defaultmod, 'bsddb') \
57 and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
58 try:
59 gdbm = __import__('gdbm')
60 except ImportError:
61 sys.stderr.write(warning_prefix +
62 ': The version of the bsddb module found '
63 'on your computer has been reported to malfunction on some datasets, '
64 'causing KeyError exceptions. You may wish to upgrade your Python to '
65 'version 2.3 or later.\n')
66 else:
67 anydbm._defaultmod = gdbm
69 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
70 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
71 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
73 # This really only matches standard '1.1.1.*'-style vendor revisions.
74 # One could conceivably have a file whose default branch is 1.1.3 or
75 # whatever, or was that at some point in time, with vendor revisions
76 # 1.1.3.1, 1.1.3.2, etc. But with the default branch gone now (which
77 # is the only time this regexp gets used), we'd have no basis for
78 # assuming that the non-standard vendor branch had ever been the
79 # default branch anyway, so we don't want this to match them anyway.
80 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
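# For illustration, typical CVS revision strings (hypothetical values) that
# these patterns are meant to classify:
#
#   trunk_rev.match('1.7')               # trunk revision
#   branch_tag.match('1.7.0.2')          # magic branch number from a symbolic name
#   vendor_tag.match('1.1.1')            # vendor branch number
#   vendor_revision.match('1.1.1.96')    # revision on the standard vendor branch
#
# each return a match object, whereas e.g. trunk_rev.match('1.7.2.1') returns None.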
82 DATAFILE = 'cvs2svn-data'
83 DUMPFILE = 'cvs2svn-dump' # The "dumpfile" we create to load into the repos
85 # Skeleton version of an svn filesystem.
86 # See class RepositoryMirror for how these work.
87 SVN_REVISIONS_DB = 'cvs2svn-revisions.db'
88 NODES_DB = 'cvs2svn-nodes.db'
90 # os.popen() on Windows seems to require an access-mode string of 'rb'
91 # in cases where the process will output binary information to stdout.
92 # Without the 'b' we get IOErrors upon closing the pipe. Unfortunately
93 # 'rb' isn't accepted in the Linux version of os.popen(). As a purely
94 # practical matter, we compensate by switching on os.name.
95 if os.name == 'nt':
96 PIPE_READ_MODE = 'rb'
97 PIPE_WRITE_MODE = 'wb'
98 else:
99 PIPE_READ_MODE = 'r'
100 PIPE_WRITE_MODE = 'w'
102 # Record the default RCS branches, if any, for CVS filepaths.
104 # The keys are CVS filepaths, relative to the top of the repository
105 # and with the ",v" stripped off, so they match the cvs paths used in
106 # Commit.commit(). The values are vendor branch revisions, such as
107 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'. The vendor branch revision
108 # represents the highest vendor branch revision thought to have ever
109 # been head of the default branch.
111 # The reason we record a specific vendor revision, rather than a
112 # default branch number, is that there are two cases to handle:
114 # One case is simple. The RCS file lists a default branch explicitly
115 # in its header, such as '1.1.1'. In this case, we know that every
116 # revision on the vendor branch is to be treated as head of trunk at
117 # that point in time.
119 # But there's also a degenerate case. The RCS file does not currently
120 # have a default branch, yet we can deduce that for some period in the
121 # past it probably *did* have one. For example, the file has vendor
122 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
123 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2. In this
124 # case, we should record 1.1.1.96 as the last vendor revision to have
125 # been the head of the default branch.
126 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
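# For the degenerate case sketched above, the recorded entry might look like
# this (hypothetical file path, relative to the repository top, ',v' stripped):
#
#   default_branches_db['proj/foo.c'] = '1.1.1.96'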
128 # Records the origin ranges for branches and tags.
129 # See class RepositoryMirror for how this works.
130 SYMBOLIC_NAME_ROOTS_DB = 'cvs2svn-symroots.db'
132 # See class SymbolicNameTracker for details.
133 SYMBOLIC_NAMES_DB = "cvs2svn-sym-names.db"
135 # Records the author and log message for each changeset.
136 # The keys are author+log digests, the same kind used to identify
137 # unique revisions in the .revs, etc files. Each value is a tuple
138 # of two elements: '(author logmessage)'.
139 METADATA_DB = "cvs2svn-metadata.db"
141 REVS_SUFFIX = '.revs'
142 CLEAN_REVS_SUFFIX = '.c-revs'
143 SORTED_REVS_SUFFIX = '.s-revs'
144 RESYNC_SUFFIX = '.resync'
146 ATTIC = os.sep + 'Attic'
148 SVN_INVALID_REVNUM = -1
150 COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
152 # Things that can happen to a file.
153 OP_NOOP = '-'
154 OP_ADD = 'A'
155 OP_DELETE = 'D'
156 OP_CHANGE = 'C'
158 # A deltatext either does or doesn't represent some change.
159 DELTATEXT_NONEMPTY = 'N'
160 DELTATEXT_EMPTY = 'E'
162 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
164 # Officially, CVS symbolic names must use a fairly restricted set of
165 # characters. Unofficially, CVS 1.10 allows any character but [$,.:;@]
166 # We don't care if some repositories out there use characters outside the
167 # official set, as long as their tags start with a letter.
168 # Since the unofficial set also includes [/\] we need to translate those
169 # into ones that don't conflict with Subversion limitations.
170 symbolic_name_re = re.compile('^[a-zA-Z].*$')
171 symbolic_name_transtbl = string.maketrans('/\\',',;')
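# For illustration (hypothetical symbolic names), the translation table maps
# '/' to ',' and '\' to ';' so the name stays usable as a Subversion path
# component:
#
#   'FOO/BAR'.translate(symbolic_name_transtbl)    # -> 'FOO,BAR'
#   'FOO\\BAR'.translate(symbolic_name_transtbl)   # -> 'FOO;BAR'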
173 # A wrapper for anydbm that uses the marshal module to store items as
174 # strings.
175 class Database:
176 def __init__(self, filename, mode):
177 self.db = anydbm.open(filename, mode)
179 def has_key(self, key):
180 return self.db.has_key(key)
182 def __getitem__(self, key):
183 return marshal.loads(self.db[key])
185 def __setitem__(self, key, value):
186 self.db[key] = marshal.dumps(value)
188 def __delitem__(self, key):
189 del self.db[key]
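# A minimal usage sketch of the wrapper (hypothetical file name and values;
# anything marshal can serialize -- tuples, lists, dicts of simple types --
# round-trips through it):
#
#   db = Database('example.db', 'n')
#   db['some-digest'] = ('jrandom', 'log message')
#   author, log = db['some-digest']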
191 class CollectData(rcsparse.Sink):
192 def __init__(self, cvsroot, log_fname_base, default_branches_db):
193 self.cvsroot = cvsroot
194 self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
195 self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
196 self.default_branches_db = default_branches_db
197 self.metadata_db = Database(METADATA_DB, 'n')
198 self.fatal_errors = []
200 # Branch and tag label types.
201 self.BRANCH_LABEL = 0
202 self.VENDOR_BRANCH_LABEL = 1
203 self.TAG_LABEL = 2
204 # A label type to string conversion list
205 self.LABEL_TYPES = [ 'branch', 'vendor branch', 'tag' ]
206 # A dict mapping label names to types
207 self.label_type = { }
209 # See set_fname() for initializations of other variables.
211 def set_fname(self, fname):
212 "Prepare to receive data for a new file."
213 self.fname = fname
215 # revision -> [timestamp, author, operation, old-timestamp]
216 self.rev_data = { }
217 self.prev = { }
219 # Hash mapping branch numbers, like '1.7.2', to branch names,
220 # like 'Release_1_0_dev'.
221 self.branch_names = { }
223 # Hash mapping revision numbers, like '1.7', to lists of names
224 # indicating which branches sprout from that revision, like
225 # ['Release_1_0_dev', 'experimental_driver', ...].
226 self.branchlist = { }
228 # Like self.branchlist, but the values are lists of tag names that
229 # apply to the key revision.
230 self.taglist = { }
232 # This is always a number -- rcsparse calls this the "principal
233 # branch", but CVS and RCS refer to it as the "default branch",
234 # so that's what we call it, even though the rcsparse API setter
235 # method is still 'set_principal_branch'.
236 self.default_branch = None
238 # If the RCS file doesn't have a default branch anymore, but does
239 # have vendor revisions, then we make an educated guess that those
240 # revisions *were* the head of the default branch up until the
241 # commit of 1.2, at which point the file's default branch became
242 # trunk. This records the date at which 1.2 was committed.
243 self.first_non_vendor_revision_date = None
245 def set_principal_branch(self, branch):
246 self.default_branch = branch
248 def set_branch_name(self, branch_number, name):
249 """Record that BRANCH_NUMBER is the branch number for branch NAME,
250 and that NAME sprouts from BRANCH_NUMBER.
251 BRANCH_NUMBER is an RCS branch number with an odd number of components,
252 for example '1.7.2' (never '1.7.0.2')."""
253 if not self.branch_names.has_key(branch_number):
254 self.branch_names[branch_number] = name
255 # The branchlist is keyed on the revision number from which the
256 # branch sprouts, so strip off the odd final component.
257 sprout_rev = branch_number[:branch_number.rfind(".")]
258 if not self.branchlist.has_key(sprout_rev):
259 self.branchlist[sprout_rev] = []
260 self.branchlist[sprout_rev].append(name)
261 else:
262 sys.stderr.write("%s: in '%s':\n"
263 " branch '%s' already has name '%s',\n"
264 " cannot also have name '%s', ignoring the latter\n"
265 % (warning_prefix, self.fname, branch_number,
266 self.branch_names[branch_number], name))
268 def rev_to_branch_name(self, revision):
269 """Return the name of the branch on which REVISION lies.
270 REVISION is a non-branch revision number with an even number of
271 components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
272 For the convenience of callers, REVISION can also be a trunk
273 revision such as '1.2', in which case just return None."""
274 if trunk_rev.match(revision):
275 return None
276 return self.branch_names.get(revision[:revision.rindex(".")])
278 def add_cvs_branch(self, revision, branch_name):
279 """Record the root revision and branch revision for BRANCH_NAME,
280 based on REVISION. REVISION is a CVS branch number having an even
281 number of components where the second-to-last is '0'. For
282 example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
283 from 1.7 and has branch number 1.7.2."""
284 last_dot = revision.rfind(".")
285 branch_rev = revision[:last_dot]
286 last2_dot = branch_rev.rfind(".")
287 branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
288 self.set_branch_name(branch_rev, branch_name)
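# Worked example (hypothetical input): for REVISION '1.7.0.2' the slicing
# above drops the '.0' component:
#
#   branch_rev = '1.7.0'  ->  '1.7' + '.2'  ==  '1.7.2'
#
# and set_branch_name() then records the name (say 'Rel2') as sprouting
# from revision '1.7'.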
290 def get_tags(self, revision):
291 """Return a list of all tag names attached to REVISION.
292 REVISION is a regular revision number like '1.7', and the result
293 never includes branch names, only plain tags."""
294 return self.taglist.get(revision, [])
296 def get_branches(self, revision):
297 """Return a list of all branch names that sprout from REVISION.
298 REVISION is a regular revision number like '1.7'."""
299 return self.branchlist.get(revision, [])
301 def define_tag(self, name, revision):
302 """Record a bidirectional mapping between symbolic NAME and REVISION.
303 REVISION is an unprocessed revision number from the RCS file's
304 header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
305 This function will determine what kind of symbolic name it is by
306 inspection, and record it in the right places."""
307 if not symbolic_name_re.match(name):
308 sys.stderr.write("%s: in '%s':\n"
309 " '%s' is not a valid tag or branch name, ignoring\n"
310 % (warning_prefix, self.fname, name))
311 elif branch_tag.match(revision):
312 label_type = self.BRANCH_LABEL
313 self.add_cvs_branch(revision, name)
314 elif vendor_tag.match(revision):
315 label_type = self.VENDOR_BRANCH_LABEL
316 self.set_branch_name(revision, name)
317 else:
318 label_type = self.TAG_LABEL
319 if not self.taglist.has_key(revision):
320 self.taglist[revision] = []
321 self.taglist[revision].append(name)
323 try:
324 # if label_types are different and at least one is a tag (We
325 # don't want to error on branch/vendor branch mismatches)
326 if (self.label_type[name] != label_type
327 and(self.label_type[name] == self.TAG_LABEL
328 or label_type == self.TAG_LABEL)):
329 err = ("%s: in '%s' (BRANCH/TAG MISMATCH):\n '%s' "
330 " is defined as %s here, but as a %s elsewhere"
331 % (error_prefix, self.fname, name,
332 self.LABEL_TYPES[label_type],
333 self.LABEL_TYPES[self.label_type[name]]))
334 sys.stderr.write(err)
335 self.fatal_errors.append(err)
336 except KeyError:
337 self.label_type[name] = label_type
339 def define_revision(self, revision, timestamp, author, state,
340 branches, next):
341 ### what else?
342 if state == 'dead':
343 op = OP_DELETE
344 else:
345 op = OP_CHANGE
347 # store the rev_data as a list in case we have to jigger the timestamp
348 self.rev_data[revision] = [int(timestamp), author, op, None]
350 # record the previous revision for sanity checking later
351 if trunk_rev.match(revision):
352 self.prev[revision] = next
353 elif next:
354 self.prev[next] = revision
355 for b in branches:
356 self.prev[b] = revision
358 # Ratchet up the highest vendor head revision, if necessary.
359 if self.default_branch:
360 if revision.find(self.default_branch) == 0:
361 # This revision is on the default branch, so record that it is
362 # the new highest vendor head revision.
363 rel_name = relative_name(self.cvsroot, self.fname)[:-2]
364 self.default_branches_db[rel_name] = revision
365 else:
366 # No default branch, so make an educated guess.
367 if revision == '1.2':
368 # This is probably the time when the file stopped having a
369 # default branch, so make a note of it.
370 self.first_non_vendor_revision_date = timestamp
371 else:
372 m = vendor_revision.match(revision)
373 if m and ((not self.first_non_vendor_revision_date)
374 or (timestamp < self.first_non_vendor_revision_date)):
375 # We're looking at a vendor revision, and it wasn't
376 # committed after this file lost its default branch, so bump
377 # the maximum trunk vendor revision in the permanent record.
378 rel_name = relative_name(self.cvsroot, self.fname)[:-2]
379 self.default_branches_db[rel_name] = revision
381 # Check for unlabeled branches, record them. We tried to collect
382 # all branch names when we parsed the symbolic name header
383 # earlier, of course, but that didn't catch unlabeled branches.
384 # If a branch is unlabeled, this is our first encounter with it,
385 # so we have to record its data now.
386 if not trunk_rev.match(revision):
387 branch_number = revision[:revision.rindex(".")]
388 branch_name = "unlabeled-" + branch_number
389 if not self.branch_names.has_key(branch_number):
390 self.set_branch_name(branch_number, branch_name)
392 def tree_completed(self):
393 "The revision tree has been parsed. Analyze it for consistency."
395 # Our algorithm depends upon the timestamps on the revisions occurring
396 # monotonically over time. That is, we want to see rev 1.34 occur in
397 # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
398 # sorting), and then tried to insert 1.34, we'd be screwed.
400 # to perform the analysis, we'll simply visit all of the 'previous'
401 # links that we have recorded and validate that the timestamp on the
402 # previous revision is before the specified revision
404 # if we have to resync some nodes, then we restart the scan. just keep
405 # looping as long as we need to restart.
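# Worked example with hypothetical data: if rev 1.35 carries timestamp 100
# but its predecessor 1.34 carries timestamp 250, then 1.34 is shoved back
# to 99 (rev_data['1.34'][0] = 99) and its original 250 is remembered as
# the old timestamp (rev_data['1.34'][3] = 250), so that
# set_revision_info() can later write a .resync record for it.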
406 while 1:
407 for current, prev in self.prev.items():
408 if not prev:
409 # no previous revision exists (i.e. the initial revision)
410 continue
411 t_c = self.rev_data[current][0]
412 t_p = self.rev_data[prev][0]
413 if t_p >= t_c:
414 # the previous revision occurred later than the current revision.
415 # shove the previous revision back in time (and any before it that
416 # may need to shift).
417 while t_p >= t_c:
418 self.rev_data[prev][0] = t_c - 1 # new timestamp
419 self.rev_data[prev][3] = t_p # old timestamp
421 print "RESYNC: '%s' (%s) : old time='%s' new time='%s'" \
422 % (relative_name(self.cvsroot, self.fname),
423 prev, time.ctime(t_p), time.ctime(t_c - 1))
425 current = prev
426 prev = self.prev[current]
427 if not prev:
428 break
429 t_c = t_c - 1 # self.rev_data[current][0]
430 t_p = self.rev_data[prev][0]
432 # break from the for-loop
433 break
434 else:
435 # finished the for-loop (no resyncing was performed)
436 return
438 def set_revision_info(self, revision, log, text):
439 timestamp, author, op, old_ts = self.rev_data[revision]
440 digest = sha.new(log + '\0' + author).hexdigest()
441 if old_ts:
442 # the timestamp on this revision was changed. log it for later
443 # resynchronization of other files' revisions that occurred
444 # for this time and log message.
445 self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
447 # "...Give back one kadam to honor the Hebrew God whose Ark this is."
448 # -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
450 # If revision 1.1 appears to have been created via 'cvs add'
451 # instead of 'cvs import', then this file probably never had a
452 # default branch, so retroactively remove its record in the
453 # default branches db. The test is that the log message CVS uses
454 # for 1.1 in imports is "Initial revision\n" with no period.
455 if revision == '1.1' and log != 'Initial revision\n':
456 rel_name = relative_name(self.cvsroot, self.fname)[:-2]
457 if self.default_branches_db.has_key(rel_name):
458 del self.default_branches_db[rel_name]
460 if text:
461 deltatext_code = DELTATEXT_NONEMPTY
462 else:
463 deltatext_code = DELTATEXT_EMPTY
465 write_revs_line(self.revs, timestamp, digest, op, revision,
466 deltatext_code, self.fname,
467 self.rev_to_branch_name(revision),
468 self.get_tags(revision),
469 self.get_branches(revision))
471 if not self.metadata_db.has_key(digest):
472 self.metadata_db[digest] = (author, log)
474 def run_command(command):
475 if os.system(command):
476 sys.exit('Command failed: "%s"' % command)
478 def make_path(ctx, path, branch_name = None, tag_name = None):
479 """Return the trunk path, branch path, or tag path for PATH.
480 CTX holds the name of the branches or tags directory, which is
481 prepended to PATH when constructing a branch or tag path.
483 If PATH is empty or None, return the root trunk|branch|tag path.
485 It is an error to pass both a BRANCH_NAME and a TAG_NAME."""
487 # For a while, we treated each top-level subdir of the CVS
488 # repository as a "project root" and interpolated the appropriate
489 # genealogy (trunk|tag|branch) in according to the official
490 # recommended layout. For example, the path '/foo/bar/baz.c' on
491 # branch 'Rel2' would become
493 # /foo/branches/Rel2/bar/baz.c
495 # and on trunk it would become
497 # /foo/trunk/bar/baz.c
499 # However, we went back to the older and simpler method of just
500 # prepending the genealogy to the front, instead of interpolating.
501 # So now we produce:
503 # /branches/Rel2/foo/bar/baz.c
504 # /trunk/foo/bar/baz.c
506 # Why? Well, Jack Repenning pointed out that this way is much
507 # friendlier to "anonymously rooted subtrees" (that's a tree where
508 # the name of the top level dir doesn't matter, the point is that if
509 # you cd into it and, say, run 'make', something good will happen).
510 # By interpolating, we made it impossible to point cvs2svn at some
511 # subdir in the CVS repository and convert it as a project, because
512 # we'd treat every subdir underneath it as an independent project
513 # root, which is probably not what the user wanted.
515 # Also, see Blair Zajac's post
517 # http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
519 # and the surrounding thread, for why what people really want is a
520 # way of specifying an in-repository prefix path, not interpolation.
522 # Check caller sanity.
523 if branch_name and tag_name:
524 sys.stderr.write("%s: make_path() miscalled: both branch and tag given.\n"
525 % error_prefix)
526 sys.exit(1)
528 if branch_name:
529 branch_name = branch_name.translate(symbolic_name_transtbl)
530 if path:
531 return ctx.branches_base + '/' + branch_name + '/' + path
532 else:
533 return ctx.branches_base + '/' + branch_name
534 elif tag_name:
535 tag_name = tag_name.translate(symbolic_name_transtbl)
536 if path:
537 return ctx.tags_base + '/' + tag_name + '/' + path
538 else:
539 return ctx.tags_base + '/' + tag_name
540 else:
541 if path:
542 return ctx.trunk_base + '/' + path
543 else:
544 return ctx.trunk_base
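# Illustrative results, assuming a ctx whose bases are the usual 'trunk',
# 'branches' and 'tags' (hypothetical paths and names):
#
#   make_path(ctx, 'foo/bar/baz.c')                      # -> 'trunk/foo/bar/baz.c'
#   make_path(ctx, 'foo/bar/baz.c', branch_name='Rel2')  # -> 'branches/Rel2/foo/bar/baz.c'
#   make_path(ctx, None, tag_name='FOO/1_0')             # -> 'tags/FOO,1_0'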
547 def relative_name(cvsroot, fname):
548 l = len(cvsroot)
549 if fname[:l] == cvsroot:
550 if fname[l] == os.sep:
551 return string.replace(fname[l+1:], os.sep, '/')
552 return string.replace(fname[l:], os.sep, '/')
553 sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
554 " cvsroot\n" % (error_prefix, cvsroot, fname))
555 sys.exit(1)
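# For example, on a POSIX system (hypothetical paths):
#
#   relative_name('/cvs', '/cvs/proj/foo.c,v')   # -> 'proj/foo.c,v'
#
# Callers strip the trailing ',v' themselves, e.g. with [:-2].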
558 def visit_file(arg, dirname, files):
559 cd, p, stats = arg
560 for fname in files:
561 if fname[-2:] != ',v':
562 continue
563 pathname = os.path.join(dirname, fname)
564 if dirname[-6:] == ATTIC:
565 # drop the 'Attic' portion from the pathname
566 ### we should record this so we can easily insert it back in
567 cd.set_fname(os.path.join(dirname[:-6], fname))
568 else:
569 cd.set_fname(pathname)
570 print pathname
571 try:
572 p.parse(open(pathname, 'rb'), cd)
573 stats[0] = stats[0] + 1
574 except (rcsparse.common.RCSParseError, ValueError, RuntimeError):
575 print "%s: '%s' is not a valid ,v file, ignoring" \
576 % (warning_prefix, pathname)
577 except:
578 print "Exception occurred while parsing %s" % pathname
579 raise
582 # Return a string that has not been returned by gen_key() before.
583 gen_key_base = 0L
584 def gen_key():
585 global gen_key_base
586 key = '%x' % gen_key_base
587 gen_key_base = gen_key_base + 1
588 return key
591 class Change:
592 """Class for recording what actually happened when a change is made,
593 because not all of the result is guessable by the caller.
594 See RepositoryMirror.change_path() for more.
596 The fields are
599 op: OP_ADD if path was added, OP_CHANGE if changed, or OP_NOOP if no
600 action.
602 closed_tags:
603 List of tags that this path can no longer be the source of,
604 that is, tags which could be rooted in the path before the
605 change, but not after.
607 closed_branches:
608 Like closed_tags, but for branches.
610 deleted_entries:
611 The list of entries deleted from the destination after
612 copying a directory, or None.
614 copyfrom_rev:
615 The actual revision from which the path was copied, which
616 may be one less than the requested revision when the path
617 was deleted in the requested revision, or None."""
618 def __init__(self, op, closed_tags, closed_branches,
619 deleted_entries=None, copyfrom_rev=None):
620 self.op = op
621 self.closed_tags = closed_tags
622 self.closed_branches = closed_branches
623 self.deleted_entries = deleted_entries
624 self.copyfrom_rev = copyfrom_rev
627 class RepositoryMirror:
628 def __init__(self):
629 # This corresponds to the 'revisions' table in a Subversion fs.
630 self.revs_db_file = SVN_REVISIONS_DB
631 self.revs_db = Database(self.revs_db_file, 'n')
633 # This corresponds to the 'nodes' table in a Subversion fs. (We
634 # don't need a 'representations' or 'strings' table because we
635 # only track metadata, not file contents.)
636 self.nodes_db_file = NODES_DB
637 self.nodes_db = Database(self.nodes_db_file, 'n')
639 # This tracks which symbolic names the current "head" of a given
640 # filepath could be the origin node for. When the next commit on
641 # that path comes along, we can tell which symbolic names
642 # originated in the previous version, and signal back to the
643 # caller that the file can no longer be the origin for those names.
645 # The values are tuples, (tags, branches), where each value is a
646 # list.
647 self.symroots_db_file = SYMBOLIC_NAME_ROOTS_DB
648 self.symroots_db = Database(self.symroots_db_file, 'n')
650 # When copying a directory (say, to create part of a branch), we
651 # pass change_path() a list of expected entries, so it can remove
652 # any that are in the source but don't belong on the branch.
653 # However, because creating a given region of a branch can involve
654 # copying from several sources, we don't want later copy
655 # operations to delete entries that were legitimately created by
656 # earlier copy ops. So after a copy, the directory records
657 # legitimate entries under this key, in a dictionary (the keys are
658 # entry names, the values can be ignored).
659 self.approved_entries = "/approved-entries"
661 # Set on a directory that's mutable in the revision currently
662 # being constructed. (Yes, this is exactly analogous to
663 # the Subversion filesystem code's concept of mutability.)
664 self.mutable_flag = "/mutable"
665 # This could represent a new mutable directory or file.
666 self.empty_mutable_thang = { self.mutable_flag : 1 }
668 # Init a root directory with no entries at revision 0.
669 self.youngest = 0
670 youngest_key = gen_key()
671 self.revs_db[str(self.youngest)] = youngest_key
672 self.nodes_db[youngest_key] = {}
674 def new_revision(self):
675 """Stabilize the current revision, then start the next one.
676 (Increments youngest.)"""
677 self.stabilize_youngest()
678 self.revs_db[str(self.youngest + 1)] \
679 = self.revs_db[str(self.youngest)]
680 self.youngest = self.youngest + 1
682 def _stabilize_directory(self, key):
683 """Close the directory whose node key is KEY."""
684 dir = self.nodes_db[key]
685 if dir.has_key(self.mutable_flag):
686 del dir[self.mutable_flag]
687 if dir.has_key(self.approved_entries):
688 del dir[self.approved_entries]
689 for entry_key in dir.keys():
690 if not entry_key[0] == '/':
691 self._stabilize_directory(dir[entry_key])
692 self.nodes_db[key] = dir
694 def stabilize_youngest(self):
695 """Stabilize the current revision by removing mutable flags."""
696 root_key = self.revs_db[str(self.youngest)]
697 self._stabilize_directory(root_key)
699 def probe_path(self, path, revision=-1, debugging=None):
700 """If PATH exists in REVISION of the svn repository mirror,
701 return its leaf value, else return None.
702 If DEBUGGING is true, then print trace output to stdout.
703 REVISION defaults to youngest, and PATH must not start with '/'."""
704 components = string.split(path, '/')
705 if revision == -1:
706 revision = self.youngest
708 if debugging:
709 print "PROBING path: '%s' in %d" % (path, revision)
711 parent_key = self.revs_db[str(revision)]
712 parent = self.nodes_db[parent_key]
713 previous_component = "/"
715 i = 1
716 for component in components:
718 if debugging:
719 print " " * i,
720 print "'%s' key: %s, val:" % (previous_component, parent_key), parent
722 if not parent.has_key(component):
723 if debugging:
724 print " PROBE ABANDONED: '%s' does not contain '%s'" \
725 % (previous_component, component)
726 return None
728 this_entry_key = parent[component]
729 this_entry_val = self.nodes_db[this_entry_key]
730 parent_key = this_entry_key
731 parent = this_entry_val
732 previous_component = component
733 i = i + 1
735 if debugging:
736 print " " * i,
737 print "parent_key: %s, val:" % parent_key, parent
739 # It's not actually a parent at this point, it's the leaf node.
740 return parent
742 def change_path(self, path, tags, branches,
743 intermediate_dir_func=None,
744 copyfrom_path=None, copyfrom_rev=None,
745 expected_entries=None, only_if_already_exists=None):
746 """Record a change to PATH. PATH may not have a leading slash.
747 Return a Change instance representing the result of the
748 change.
750 TAGS are any tags that sprout from this revision of PATH, BRANCHES
751 are any branches that sprout from this revision of PATH.
753 If INTERMEDIATE_DIR_FUNC is not None, then invoke it once on
754 each full path to each missing intermediate directory in PATH, in
755 order from shortest to longest.
757 If COPYFROM_REV and COPYFROM_PATH are not None, then they are a
758 revision and path to record as the copyfrom sources of this node.
759 Since this implies an add (OP_ADD), it would be reasonable to
760 error and exit if the copyfrom args are present but the node also
761 already exists. Reasonable -- but not what we do :-). The most
762 useful behavior for callers is instead to report that nothing was
763 done, by returning OP_NOOP for Change.op, so that's what we do.
765 It is an error for only one copyfrom argument to be present.
767 If EXPECTED_ENTRIES is not None, then it holds entries expected
768 to be in the dst after the copy. Any entries in the new dst but
769 not in EXPECTED_ENTRIES are removed (ignoring keys beginning with
770 '/'), and the removed entries returned in Change.deleted_entries,
771 which are otherwise None.
773 No action is taken for keys in EXPECTED_ENTRIES but not in the
774 dst; it is assumed that the caller will compensate for these by
775 calling change_path again with other arguments.
777 If ONLY_IF_ALREADY_EXISTS is set, then do a no-op, rather than an add,
778 if the path does not exist. This is to allow pruning using EXPECTED_ENTRIES
779 without risking erroneously adding a path."""
781 # Check caller sanity.
782 if ((copyfrom_rev and not copyfrom_path) or
783 (copyfrom_path and not copyfrom_rev)):
784 sys.stderr.write("%s: change_path() called with one copyfrom "
785 "argument but not the other.\n" % error_prefix)
786 sys.exit(1)
788 components = string.split(path, '/')
789 path_so_far = None
791 deletions = []
793 parent_key = self.revs_db[str(self.youngest)]
794 parent = self.nodes_db[parent_key]
795 if not parent.has_key(self.mutable_flag):
796 parent_key = gen_key()
797 parent[self.mutable_flag] = 1
798 self.nodes_db[parent_key] = parent
799 self.revs_db[str(self.youngest)] = parent_key
801 for component in components[:-1]:
802 # parent is always mutable at the top of the loop
804 if path_so_far:
805 path_so_far = path_so_far + '/' + component
806 else:
807 path_so_far = component
809 # Ensure that the parent has an entry for this component.
810 if not parent.has_key(component):
811 if only_if_already_exists:
812 return Change(OP_NOOP, [], [], deletions)
813 # else
814 new_child_key = gen_key()
815 parent[component] = new_child_key
816 self.nodes_db[new_child_key] = self.empty_mutable_thang
817 self.nodes_db[parent_key] = parent
818 if intermediate_dir_func:
819 intermediate_dir_func(path_so_far)
821 # One way or another, parent dir now has an entry for component,
822 # so grab it, see if it's mutable, and DTRT if it's not. (Note
823 # it's important to reread the entry value from the db, even
824 # though we might have just written it -- if we tweak existing
825 # data structures, we could modify self.empty_mutable_thang,
826 # which must not happen.)
827 this_entry_key = parent[component]
828 this_entry_val = self.nodes_db[this_entry_key]
829 if not this_entry_val.has_key(self.mutable_flag):
830 this_entry_val[self.mutable_flag] = 1
831 this_entry_key = gen_key()
832 parent[component] = this_entry_key
833 self.nodes_db[this_entry_key] = this_entry_val
834 self.nodes_db[parent_key] = parent
836 parent_key = this_entry_key
837 parent = this_entry_val
839 # Now change the last node, the versioned file. Just like at the
840 # top of the above loop, parent is already mutable.
841 op = OP_ADD
842 if self.symroots_db.has_key(path):
843 old_names = self.symroots_db[path]
844 else:
845 old_names = [], []
846 last_component = components[-1]
847 new_val = { }
848 if parent.has_key(last_component):
849 # The contract for copying over existing nodes is to do nothing
850 # and return:
851 if copyfrom_path:
852 return Change(OP_NOOP, old_names[0], old_names[1], deletions)
853 # else
854 op = OP_CHANGE
855 new_val = self.nodes_db[parent[last_component]]
856 elif only_if_already_exists:
857 return Change(OP_NOOP, [], [], deletions)
859 leaf_key = gen_key()
860 if copyfrom_path:
861 new_val = self.probe_path(copyfrom_path, copyfrom_rev)
862 if new_val is None:
863 # Sometimes a branch is rooted in a revision that RCS has
864 # marked as 'dead'. There is no reason to assume that the
865 # current path shares any history with any older live parent
866 # of the dead revision, so we do nothing and return.
867 return Change(OP_NOOP, [], [], deletions)
868 if expected_entries is not None:
869 # If it is not None, then even if it is an empty list/tuple,
870 # we need to approve this item in its parent's approved entries list.
871 approved_entries = parent.get(self.approved_entries) or {}
872 approved_entries[last_component] = 1
873 parent[self.approved_entries] = approved_entries
874 if expected_entries:
875 approved_entries = new_val.get(self.approved_entries) or { }
876 new_approved_entries = { }
877 for ent in new_val.keys():
878 if (ent[0] != '/'):
879 if (not expected_entries.has_key(ent)
880 and not approved_entries.has_key(ent)):
881 del new_val[ent]
882 deletions.append(ent)
883 else:
884 new_approved_entries[ent] = 1
885 new_val[self.approved_entries] = new_approved_entries
886 parent[last_component] = leaf_key
887 self.nodes_db[parent_key] = parent
888 self.symroots_db[path] = (tags, branches)
889 new_val[self.mutable_flag] = 1
890 self.nodes_db[leaf_key] = new_val
892 return Change(op, old_names[0], old_names[1], deletions, copyfrom_rev)
894 def delete_path(self, path, tags, branches, prune=None):
895 """Delete PATH from the tree. PATH may not have a leading slash.
897 Return a tuple (path_deleted, closed_tags, closed_branches), where
898 path_deleted is the path actually deleted or None if PATH did not
899 exist, and closed_tags and closed_branches are lists of symbolic
900 names closed off by this deletion -- that is, tags or branches
901 which could be rooted in the previous revision of PATH, but not in
902 this revision, because this rev changes PATH. If path_deleted is
903 None, then closed_tags and closed_branches will both be empty.
905 TAGS are any tags that sprout from this revision of PATH, BRANCHES
906 are any branches that sprout from this revision of PATH. (I can't
907 imagine that there are any of either, what to do if there are?)
909 If PRUNE is not None, then delete the highest possible directory,
910 which means the returned path may differ from PATH. In other
911 words, if PATH was the last entry in its parent, then delete
912 PATH's parent, unless it too is the last entry in *its* parent, in
913 which case delete that parent, and so on up the chain, until a
914 directory is encountered that has an entry which is not a member
915 of the parent stack of the original target.
917 NOTE: This function does *not* allow you to delete top-level entries
918 (like /trunk, /branches, /tags), nor does it prune upwards beyond
919 those entries.
921 PRUNE is like the -P option to 'cvs checkout'."""
923 components = string.split(path, '/')
924 path_so_far = None
926 parent_key = self.revs_db[str(self.youngest)]
927 parent = self.nodes_db[parent_key]
929 # As we walk down to find the dest, we remember each parent
930 # directory's name and db key, in reverse order: push each new key
931 # onto the front of the list, so that by the time we reach the
932 # destination node, the zeroth item in the list is the parent of
933 # that destination.
935 # Then if we actually do the deletion, we walk the list from left
936 # to right, replacing as appropriate.
938 # The root directory has name None.
939 parent_chain = [ ]
940 parent_chain.insert(0, (None, parent_key))
942 def is_prunable(dir):
943 """Return true if DIR, a dictionary representing a directory,
944 has just zero or one non-special entry, else return false.
945 (In a pure world, we'd just ask len(DIR) > 1; it's only
946 because the directory might have mutable flags and other special
947 entries that we need this function at all.)"""
948 num_items = len(dir)
949 if num_items > 3:
950 return None
951 if num_items == 3 or num_items == 2:
952 real_entries = 0
953 for key in dir.keys():
954 if not key[0] == '/': real_entries = real_entries + 1
955 if real_entries > 1:
956 return None
957 else:
958 return 1
959 else:
960 return 1
962 # We never prune our top-level directories (/trunk, /tags, /branches)
963 if len(components) < 2:
964 return None, [], []
966 for component in components[:-1]:
967 if path_so_far:
968 path_so_far = path_so_far + '/' + component
969 else:
970 path_so_far = component
972 # If we can't reach the dest, then we don't need to do anything.
973 if not parent.has_key(component):
974 return None, [], []
976 # Otherwise continue downward, dropping breadcrumbs.
977 this_entry_key = parent[component]
978 this_entry_val = self.nodes_db[this_entry_key]
979 parent_key = this_entry_key
980 parent = this_entry_val
981 parent_chain.insert(0, (component, parent_key))
983 # If the target is not present in its parent, then we're done.
984 last_component = components[-1]
985 old_names = [], []
986 if not parent.has_key(last_component):
987 return None, [], []
988 elif self.symroots_db.has_key(path):
989 old_names = self.symroots_db[path]
990 del self.symroots_db[path]
992 # The target is present, so remove it and bubble up, making a new
993 # mutable path and/or pruning as necessary.
994 pruned_count = 0
995 prev_entry_name = last_component
996 new_key = None
997 for parent_item in parent_chain:
998 pkey = parent_item[1]
999 pval = self.nodes_db[pkey]
1001 # If we're pruning at all, and we're looking at a prunable thing
1002 # (and that thing isn't one of our top-level directories --
1003 # trunk, tags, branches) ...
1004 if prune and (new_key is None) and is_prunable(pval) \
1005 and parent_item != parent_chain[-2]:
1006 # ... then up our count of pruned items, and do nothing more.
1007 # All the action takes place when we hit a non-prunable
1008 # parent.
1009 pruned_count = pruned_count + 1
1010 else:
1011 # Else, we've hit a non-prunable, or aren't pruning, so bubble
1012 # up the new gospel.
1013 pval[self.mutable_flag] = 1
1014 if new_key is None:
1015 del pval[prev_entry_name]
1016 else:
1017 pval[prev_entry_name] = new_key
1018 new_key = gen_key()
1020 prev_entry_name = parent_item[0]
1021 if new_key:
1022 self.nodes_db[new_key] = pval
1024 if new_key is None:
1025 new_key = gen_key()
1026 self.nodes_db[new_key] = self.empty_mutable_thang
1028 # Install the new root entry.
1029 self.revs_db[str(self.youngest)] = new_key
1031 # Sanity check -- this should be a "can't happen".
1032 if pruned_count > len(components):
1033 sys.stderr.write("%s: deleting '%s' tried to prune %d components.\n"
1034 % (error_prefix, path, pruned_count))
1035 sys.exit(1)
1037 if pruned_count:
1038 if pruned_count == len(components):
1039 # We never prune away the root directory, so back up one component.
1040 pruned_count = pruned_count - 1
1041 retpath = string.join(components[:0 - pruned_count], '/')
1042 else:
1043 retpath = path
1045 return retpath, old_names[0], old_names[1]
1047 ### We've no place to put tags + branches. Suspect we just
1048 ### shouldn't be taking them as arguments, which the doc string
1049 ### implies already. Ponder.
1051 def close(self):
1052 # Just stabilize the last revision. This may or may not affect
1053 # anything, but if we end up using the mirror for anything after
1054 # this, it's nice to know the '/mutable' entries are gone.
1055 self.stabilize_youngest()
1057 if sys.platform == "win32":
1058 def escape_shell_arg(str):
1059 return '"' + string.replace(str, '"', '"^""') + '"'
1060 else:
1061 def escape_shell_arg(str):
1062 return "'" + string.replace(str, "'", "'\\''") + "'"
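# For example, with a hypothetical argument containing a single quote:
#
#   escape_shell_arg("it's")   # -> 'it'\''s'   (non-Windows case)
#
# while on win32 an embedded double quote becomes "^"" inside the quoted string.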
1064 class Dumper:
1065 def __init__(self, ctx):
1066 'Initialize the Dumper from CTX; the revision counter starts at 0.'
1067 self.dumpfile_path = ctx.dumpfile
1068 self.revision = 0
1069 self.repos_mirror = RepositoryMirror()
1070 self.svnadmin = ctx.svnadmin
1071 self.target = ctx.target
1072 self.dump_only = ctx.dump_only
1073 self.dumpfile = None
1074 self.path_encoding = ctx.encoding
1075 self.loader_pipe = None
1077 # If all we're doing here is dumping, we can go ahead and
1078 # initialize our single dumpfile. Else, if we're supposed to
1079 # create the repository, do so.
1080 if self.dump_only:
1081 self.init_dumpfile()
1082 self.write_dumpfile_header(self.dumpfile)
1083 else:
1084 if not ctx.existing_svnrepos:
1085 print "creating repos '%s'" % (self.target)
1086 run_command('%s create %s %s' % (self.svnadmin, ctx.bdb_txn_nosync
1087 and "--bdb-txn-nosync" or "", self.target))
1088 self.loader_pipe = os.popen('%s load -q %s' %
1089 (self.svnadmin, self.target), PIPE_WRITE_MODE)
1090 self.write_dumpfile_header(self.loader_pipe)
1093 def init_dumpfile(self):
1094 # Open the dumpfile for binary-mode write.
1095 self.dumpfile = open(self.dumpfile_path, 'wb')
1098 def write_dumpfile_header(self, fileobj):
1099 # Initialize the dumpfile with the standard headers:
1101 # The CVS repository doesn't have a UUID, and the Subversion
1102 # repository will be created with one anyway. So when we load
1103 # the dumpfile, we don't specify a UUID.
1104 fileobj.write('SVN-fs-dump-format-version: 2\n\n')
1106 def flush_and_remove_dumpfile(self):
1107 if self.dumpfile is None:
1108 return
1109 self.dumpfile.close()
1110 print "piping revision %d into '%s' loader" % (self.revision, self.target)
1111 dumpfile = open(self.dumpfile_path, 'rb')
1112 while 1:
1113 data = dumpfile.read(1024*1024) # Choice of 1MB chunks is arbitrary
1114 if not len(data): break
1115 self.loader_pipe.write(data)
1116 dumpfile.close()
1118 os.remove(self.dumpfile_path)
1120 def start_revision(self, props):
1121 """Write the next revision, with properties, to the dumpfile.
1122 Return the newly started revision."""
1124 # If this is not a --dump-only run, we need to flush (load into the
1125 # repository) any dumpfile data we have already written and then
1126 # init a new dumpfile before starting this revision.
1128 if not self.dump_only:
1129 if self.revision > 0:
1130 self.flush_and_remove_dumpfile()
1131 self.init_dumpfile()
1133 self.revision = self.revision + 1
1135 # A revision typically looks like this:
1137 # Revision-number: 1
1138 # Prop-content-length: 129
1139 # Content-length: 129
1141 # K 7
1142 # svn:log
1143 # V 27
1144 # Log message for revision 1.
1145 # K 10
1146 # svn:author
1147 # V 7
1148 # jrandom
1149 # K 8
1150 # svn:date
1151 # V 27
1152 # 2003-04-22T22:57:58.132837Z
1153 # PROPS-END
1155 # Notice that the length headers count everything -- not just the
1156 # length of the data but also the lengths of the lengths, including
1157 # the 'K ' or 'V ' prefixes.
1159 # The reason there are both Prop-content-length and Content-length
1160 # is that the former includes just props, while the latter includes
1161 # everything. That's the generic header form for any entity in a
1162 # dumpfile. But since revisions only have props, the two lengths
1163 # are always the same for revisions.
1165 # Calculate the total length of the props section.
1166 total_len = 10 # len('PROPS-END\n')
1167 for propname in props.keys():
1168 klen = len(propname)
1169 klen_len = len('K %d' % klen)
1170 vlen = len(props[propname])
1171 vlen_len = len('V %d' % vlen)
1172 # + 4 for the four newlines within a given property's section
1173 total_len = total_len + klen + klen_len + vlen + vlen_len + 4
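# Worked example against the sample revision above: 'svn:log' contributes
# len('K 7') + 7 + len('V 27') + 27 + 4 = 45, 'svn:author' contributes
# 4 + 10 + 3 + 7 + 4 = 28, 'svn:date' contributes 3 + 8 + 4 + 27 + 4 = 46,
# and 'PROPS-END\n' adds 10, giving 129 -- the Prop-content-length shown
# in the sample.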
1175 # Print the revision header and props
1176 self.dumpfile.write('Revision-number: %d\n'
1177 'Prop-content-length: %d\n'
1178 'Content-length: %d\n'
1179 '\n'
1180 % (self.revision, total_len, total_len))
1182 for propname in props.keys():
1183 self.dumpfile.write('K %d\n'
1184 '%s\n'
1185 'V %d\n'
1186 '%s\n' % (len(propname),
1187 propname,
1188 len(props[propname]),
1189 props[propname]))
1191 self.dumpfile.write('PROPS-END\n')
1192 self.dumpfile.write('\n')
1194 self.repos_mirror.new_revision()
1195 return self.revision
1197 def add_dir(self, path):
1198 self.dumpfile.write("Node-path: %s\n"
1199 "Node-kind: dir\n"
1200 "Node-action: add\n"
1201 "Prop-content-length: 10\n"
1202 "Content-length: 10\n"
1203 "\n"
1204 "PROPS-END\n"
1205 "\n"
1206 "\n" % self.utf8_path(path))
1208 def utf8_path(self, path):
1209 """Return UTF-8 encoded 'path' based on ctx.path_encoding."""
1210 try:
1211 ### Log messages can be converted with 'replace' strategy.
1212 ### We can't afford that here.
1213 unicode_path = unicode(path, self.path_encoding, 'strict')
1214 return unicode_path.encode('utf-8')
1216 except UnicodeError:
1217 print "Unable to convert a path '%s' to internal encoding." % path
1218 print "Consider rerunning with (for example) '--encoding=latin1'"
1219 sys.exit(1)
1222 def probe_path(self, path):
1223 """Return true if PATH exists in the youngest tree of the svn
1224 repository, else return None. PATH does not start with '/'."""
1225 if self.repos_mirror.probe_path(path) is None:
1226 return None
1227 else:
1228 return 1
1230 def copy_path(self, svn_src_path, svn_src_rev, svn_dst_path, entries=None):
1231 """If it wouldn't be redundant to do so, emit a copy of SVN_SRC_PATH at
1232 SVN_SRC_REV to SVN_DST_PATH.
1234 Return 1 if the copy was done, None otherwise.
1236 If ENTRIES is not None, it is a dictionary whose keys are the full
1237 set of entries the new copy is expected to have -- and therefore
1238 any entries in the new dst but not in ENTRIES will be removed.
1239 (Keys in ENTRIES beginning with '/' are ignored.)
1241 No action is taken for keys in ENTRIES but not in the dst; it is
1242 assumed that the caller will compensate for these by calling
1243 copy_path again with other arguments."""
1244 change = self.repos_mirror.change_path(svn_dst_path,
1245 [], [],
1246 self.add_dir,
1247 svn_src_path, svn_src_rev,
1248 entries)
1249 if change.op == OP_ADD:
1250 if change.copyfrom_rev >= self.revision:
1251 sys.stderr.write("%s: invalid copyfrom revision %d used while\n"
1252 "creating revision %d in dumpfile.\n"
1253 % (error_prefix, change.copyfrom_rev, self.revision))
1254 sys.exit(1)
1256 # We don't need to include "Node-kind:" for copies; the loader
1257 # ignores it anyway and just uses the source kind instead.
1258 self.dumpfile.write('Node-path: %s\n'
1259 'Node-action: add\n'
1260 'Node-copyfrom-rev: %d\n'
1261 'Node-copyfrom-path: /%s\n'
1262 '\n'
1263 % (self.utf8_path(svn_dst_path),
1264 change.copyfrom_rev,
1265 self.utf8_path(svn_src_path)))
1267 for ent in change.deleted_entries:
1268 self.dumpfile.write('Node-path: %s\n'
1269 'Node-action: delete\n'
1270 '\n' % (self.utf8_path(svn_dst_path + '/' + ent)))
1271 return 1
1272 return None
1274 def prune_entries(self, path, expected):
1275 """Delete any entries in PATH that are not in list EXPECTED.
1276 PATH need not be a directory, but of course nothing will happen if
1277 it's a file. Entries beginning with '/' are ignored as usual."""
1278 change = self.repos_mirror.change_path(path,
1279 [], [],
1280 self.add_dir,
1281 None, None,
1282 expected, 1)
1283 for ent in change.deleted_entries:
1284 self.dumpfile.write('Node-path: %s\n'
1285 'Node-action: delete\n'
1286 '\n' % (self.utf8_path(path + '/' + ent)))
1288 def add_or_change_path(self, cvs_path, svn_path, cvs_rev, rcs_file,
1289 tags, branches, cvs_revnums):
1291 # figure out the real file path for "co"
1292 try:
1293 f_st = os.stat(rcs_file)
1294 except os.error:
1295 dirname, fname = os.path.split(rcs_file)
1296 rcs_file = os.path.join(dirname, 'Attic', fname)
1297 f_st = os.stat(rcs_file)
1299 # We begin with only a "CVS revision" property.
1300 if cvs_revnums:
1301 prop_contents = 'K 15\ncvs2svn:cvs-rev\nV %d\n%s\n' \
1302 % (len(cvs_rev), cvs_rev)
1303 else:
1304 prop_contents = ''
1306 # Check for executable-ness.
1307 if f_st[0] & stat.S_IXUSR:
1308 prop_contents = prop_contents + 'K 14\nsvn:executable\nV 1\n*\n'
1310 # Calculate the property length (+10 for "PROPS-END\n")
1311 props_len = len(prop_contents) + 10
1313 ### FIXME: We ought to notice the -kb flag set on the RCS file and
1314 ### use it to set svn:mime-type.
1316 basename = os.path.basename(rcs_file[:-2])
1317 pipe_cmd = 'co -q -x,v -p%s %s' % (cvs_rev, escape_shell_arg(rcs_file))
1318 pipe = os.popen(pipe_cmd, PIPE_READ_MODE)
1320 # You might think we could just test
1322 # if cvs_rev[-2:] == '.1':
1324 # to determine if this path exists in head yet. But that wouldn't
1325 # be perfectly reliable, both because of 'cvs commit -r', and also
1326 # the possibility of file resurrection.
1327 change = self.repos_mirror.change_path(svn_path, tags, branches,
1328 self.add_dir)
1330 if change.op == OP_ADD:
1331 action = 'add'
1332 else:
1333 action = 'change'
1335 self.dumpfile.write('Node-path: %s\n'
1336 'Node-kind: file\n'
1337 'Node-action: %s\n'
1338 'Prop-content-length: %d\n'
1339 'Text-content-length: '
1340 % (self.utf8_path(svn_path), action, props_len))
1342 pos = self.dumpfile.tell()
1344 self.dumpfile.write('0000000000000000\n'
1345 'Text-content-md5: 00000000000000000000000000000000\n'
1346 'Content-length: 0000000000000000\n'
1347 '\n')
1349 self.dumpfile.write(prop_contents + 'PROPS-END\n')
1351 # Insert the rev contents, calculating length and checksum as we go.
1352 checksum = md5.new()
1353 length = 0
1354 buf = pipe.read()
1355 while buf:
1356 checksum.update(buf)
1357 length = length + len(buf)
1358 self.dumpfile.write(buf)
1359 buf = pipe.read()
1360 if pipe.close() is not None:
1361 sys.exit('%s: Command failed: "%s"' % (error_prefix, pipe_cmd))
1363 # Go back to patch up the length and checksum headers:
1364 self.dumpfile.seek(pos, 0)
1365 # We left 16 zeros for the text length; replace them with the real
1366 # length, padded on the left with spaces:
1367 self.dumpfile.write('%16d' % length)
1368 # 16... + 1 newline + len('Text-content-md5: ') == 35
1369 self.dumpfile.seek(pos + 35, 0)
1370 self.dumpfile.write(checksum.hexdigest())
1371 # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
1372 self.dumpfile.seek(pos + 84, 0)
1373 # The content length is the length of property data, text data,
1374 # and any metadata around/inside them.
1375 self.dumpfile.write('%16d' % (length + props_len))
1376 # Jump back to the end of the stream
1377 self.dumpfile.seek(0, 2)
1379 # This record is done (write two newlines -- one to terminate
1380 # contents that weren't themselves newline-terminated, one to
1381 # provide a blank line for readability).
1382 self.dumpfile.write('\n\n')
1383 return change.closed_tags, change.closed_branches
1385 def delete_path(self, svn_path, tags, branches, prune=None):
1386 """If SVN_PATH exists in the head mirror, output the deletion to
1387 the dumpfile, else output nothing to the dumpfile.
1389 Return a tuple (path_deleted, closed_tags, closed_branches), where
1390 path_deleted is the path deleted if any or None if no deletion was
1391 necessary, and closed_tags and closed_branches are lists of symbolic
1392 names closed off by this deletion -- that is, tags or branches
1393 which could be rooted in the previous revision of PATH, but not in
1394 this revision, because this rev changes PATH. If path_deleted is
1395 None, then closed_tags and closed_branches will both be empty.
1397 Iff PRUNE is true, then the path deleted may be non-None yet
1398 shorter than SVN_PATH because of pruning."""
1399 deleted_path, closed_tags, closed_branches \
1400 = self.repos_mirror.delete_path(svn_path, tags,
1401 branches, prune)
1402 if deleted_path:
1403 print " (deleted '%s')" % deleted_path
1404 self.dumpfile.write('Node-path: %s\n'
1405 'Node-action: delete\n'
1406 '\n' % self.utf8_path(deleted_path))
1407 return deleted_path, closed_tags, closed_branches
1409 def close(self):
1410 self.repos_mirror.close()
1412 # If we're only making a dumpfile, we should be done now. Just
1413 # close the dumpfile. Otherwise, we're in "incremental" mode, and
1414 # we need to close our incremental dumpfile, flush it to the
1415 # repository, and then remove it.
1416 if self.dump_only:
1417 self.dumpfile.close()
1418 else:
1419 self.flush_and_remove_dumpfile()
1420 ret = self.loader_pipe.close()
1421 if ret:
1422 sys.stderr.write('%s: svnadmin load exited with error code %s' %
1423 (error_prefix, ret))
1424 sys.exit(1)
1427 def format_date(date):
1428 """Return an svn-compatible date string for DATE (seconds since epoch)."""
1429 # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
1430 return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
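# For example, for the epoch (purely illustrative):
#
#   format_date(0)   # -> '1970-01-01T00:00:00.000000Z'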
1433 def make_revision_props(ctx, symbolic_name, is_tag, date=None):
1434 """Return a dictionary of revision properties for the manufactured
1435 commit that finished SYMBOLIC_NAME. If IS_TAG is true, write the
1436 log message as though for a tag, else as though for a branch.
1437 If DATE is passed, use it as the value of the svn:date property."""
1438 if is_tag:
1439 type = 'tag'
1440 else:
1441 type = 'branch'
1443 # In Python 2.2.3, we could use textwrap.fill(). Oh well :-).
1444 if len(symbolic_name) >= 13:
1445 space_or_newline = '\n'
1446 else:
1447 space_or_newline = ' '
1449 log = "This commit was manufactured by cvs2svn to create %s%s'%s'." \
1450 % (type, space_or_newline, symbolic_name)
1452 return { 'svn:author' : ctx.username,
1453 'svn:log' : log,
1454 'svn:date' : date or format_date(time.time())}
1457 class SymbolicNameTracker:
1458 """Track the Subversion path/revision ranges of CVS symbolic names.
1459 This is done in a .db file, representing a tree in the usual way.
1460 In addition to directory entries, each object in the database stores
1461 the earliest revision from which it could be copied, and the first
1462 revision from which it could no longer be copied. Intermediate
1463 directories go one step farther: they record counts for the various
1464 revisions from which items under them could have been copied, and
1465 counts for the cutoff revisions. For example:
1467 [Diagram: a directory 'sub1'/'sub2' containing files 'file1' through 'file4'; each node carries an opening-revision list and a closing-revision list, e.g. [(2, 1), (3, 3)] and [(5, 1), (17, 2), (50, 1)] for the directory, [(3, 2)] and [(17, 1), (50, 1)] for 'file1', [(2, 1), (3, 1)] and [(5, 1), (10, 1)] for 'file3'.]
1502 The two lists to the right of each node represent the 'opening' and
1503 'closing' revisions respectively. Each tuple in a list is of the
1504 form (REV, COUNT). For leaf nodes, COUNT is always 1, of course.
1505 For intermediate nodes, the counts are the sums of the corresponding
1506 counts of child nodes.
1508 These revision scores are used to determine the optimal copy
1509 revisions for each tree/subtree at branch or tag creation time.
1511 The svn path input will most often be a trunk path, because the
1512 path/rev information recorded here is about where and when the given
1513 symbolic name could be rooted, *not* a path/rev for which commits
1514 along that symbolic name take place (of course, commits only happen on
1515 branches anyway)."""
1517 def __init__(self):
1518 self.db_file = SYMBOLIC_NAMES_DB
1519 self.db = Database(self.db_file, 'n')
1520 self.root_key = gen_key()
1521 self.db[self.root_key] = {}
1523 # The keys for the opening and closing revision lists attached to
1524 # each directory or file. Includes "/" so as never to conflict
1525 # with any real entry.
1526 self.tags_opening_revs_key = "/tag-openings"
1527 self.tags_closing_revs_key = "/tag-closings"
1528 self.br_opening_revs_key = "/br-openings"
1529 self.br_closing_revs_key = "/br-closings"
1531 # When a node is copied into the repository, the revision copied
1532 # is stored under the appropriate key, and the corresponding
1533 # opening and closing rev lists are removed.
1534 self.tags_copyfrom_rev_key = "/tags-copyfrom-rev"
1535 self.br_copyfrom_rev_key = "/br-copyfrom-rev"
1537 def probe_path(self, symbolic_name, path, debugging=None):
1538 """If 'SYMBOLIC_NAME/PATH' exists in the symbolic name tree,
1539 return the value of its last component, else return None.
1540 PATH may be None, but may not start with '/'.
1541 If DEBUGGING is true, then print trace output to stdout."""
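    # For example (illustrative only): probe_path('mytag', 'proj/foo.c')
    # walks root -> 'mytag' -> 'proj' -> 'foo.c' and returns the dict
    # stored for the 'foo.c' node, while probe_path('mytag', None)
    # returns the dict stored for the 'mytag' node itself.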
1542 if path:
1543 components = [symbolic_name] + string.split(path, '/')
1544 else:
1545 components = [symbolic_name]
1547 if debugging:
1548 print "PROBING SYMBOLIC NAME:\n", components
1550 parent_key = self.root_key
1551 parent = self.db[parent_key]
1552 last_component = "/"
1553 i = 1
1554 for component in components:
1555 if debugging:
1556 print " " * i,
1557 print "'%s' key: %s, val:" % (last_component, parent_key), parent
1559 # Check for a "can't happen."
1560 if not parent.has_key(component):
1561 sys.stderr.write("%s: sym probe failed: '%s' does not contain '%s'\n"
1562 % (error_prefix, last_component, component))
1563 sys.exit(1)
1565 this_entry_key = parent[component]
1566 this_entry_val = self.db[this_entry_key]
1567 parent_key = this_entry_key
1568 parent = this_entry_val
1569 last_component = component
1570 i = i + 1
1572 if debugging:
1573 print " " * i,
1574 print "parent_key: %s, val:" % parent_key, parent
1576 # It's not actually a parent at this point, it's the leaf node.
1577 return parent
1579 def bump_rev_count(self, item_key, rev, revlist_key):
1580 """Increment REV's count in opening or closing list under KEY.
1581 REVLIST_KEY is self.*_opening_revs_key or self.*_closing_revs_key,
1582 and indicates which rev list to increment REV's count in.
1584 For example, if REV is 7, REVLIST_KEY is
1585 self.tags_opening_revs_key, and the entry's tags opening revs list
1586 looks like this
1588 [(2, 5), (7, 2), (10, 15)]
1590 then afterwards it would look like this:
1592 [(2, 5), (7, 3), (10, 15)]
1594 But if no tuple for revision 7 were present, then one would be
1595 added, for example
1597 [(2, 5), (10, 15)]
1599 would become
1601 [(2, 5), (7, 1), (10, 15)]
1603 The list is sorted by ascending revision both before and after."""
1605 entry_val = self.db[item_key]
1607 if not entry_val.has_key(revlist_key):
1608 entry_val[revlist_key] = [(rev, 1)]
1609 else:
1610 rev_counts = entry_val[revlist_key]
1611 for i in range(len(rev_counts)):
1612 this_rev, this_count = rev_counts[i]
1613 if rev == this_rev:
1614 rev_counts[i] = (this_rev, this_count + 1)
1615 break
1616 elif this_rev > rev:
1619 rev_counts.insert(i, (rev, 1))
1620 break
1621 else:
1622 rev_counts.append((rev, 1))
1623 entry_val[revlist_key] = rev_counts
1625 self.db[item_key] = entry_val
1627 # The verb form of "root" is "root", but that would be misleading in
1628 # this case; and the opposite of "uproot" is presumably "downroot",
1629 # but that wouldn't exactly clarify either. Hence, "enroot" :-).
1630 def enroot_names(self, svn_path, svn_rev, names, opening_key):
1631 """Record SVN_PATH at SVN_REV as the earliest point from which the
1632 symbolic names in NAMES could be copied. OPENING_KEY is
1633 self.tags_opening_revs_key or self.br_opening_revs_key, to
1634 indicate whether NAMES contains tag names or branch names.
1635 SVN_PATH does not start with '/'."""
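    # Illustrative sketch: enroot_names('trunk/proj/foo.c', 5, ['mytag'],
    # self.tags_opening_revs_key) bumps the revision-5 opening count on
    # the root node and on the 'mytag', 'mytag/trunk', 'mytag/trunk/proj'
    # and 'mytag/trunk/proj/foo.c' nodes, creating any of those nodes
    # that do not exist yet.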
1637 # Guard against names == None
1638 if not names:
1639 return
1641 for name in names:
1642 components = [name] + string.split(svn_path, '/')
1643 parent_key = self.root_key
1644 for component in components:
1645 self.bump_rev_count(parent_key, svn_rev, opening_key)
1646 parent = self.db[parent_key]
1647 if not parent.has_key(component):
1648 new_child_key = gen_key()
1649 parent[component] = new_child_key
1650 self.db[new_child_key] = {}
1651 self.db[parent_key] = parent
1652 # One way or another, parent now has an entry for component.
1653 this_entry_key = parent[component]
1654 this_entry_val = self.db[this_entry_key]
1655 # Swaparoo.
1656 parent_key = this_entry_key
1657 parent = this_entry_val
1659 self.bump_rev_count(parent_key, svn_rev, opening_key)
1661 def enroot_tags(self, svn_path, svn_rev, tags):
1662 """Record SVN_PATH at SVN_REV as the earliest point from which the
1663 symbolic names in TAGS could be copied. SVN_PATH does not start
1664 with '/'."""
1665 self.enroot_names(svn_path, svn_rev, tags, self.tags_opening_revs_key)
1667 def enroot_branches(self, svn_path, svn_rev, branches):
1668 """Record SVN_PATH at SVN_REV as the earliest point from which the
1669 symbolic names in BRANCHES could be copied. SVN_PATH does not
1670 start with '/'."""
1671 self.enroot_names(svn_path, svn_rev, branches, self.br_opening_revs_key)
1673 def close_names(self, svn_path, svn_rev, names, closing_key):
1674 """Record that as of SVN_REV, SVN_PATH could no longer be the
1675 source from which any of the symbolic names in NAMES could be copied.
1676 CLOSING_KEY is self.tags_closing_revs_key or
1677 self.br_closing_revs_key, to indicate whether NAMES are tags or
1678 branches. SVN_PATH does not start with '/'."""
1680 # Guard against names == None
1681 if not names:
1682 return
1684 for name in names:
1685 components = [name] + string.split(svn_path, '/')
1686 parent_key = self.root_key
1687 for component in components:
1688 self.bump_rev_count(parent_key, svn_rev, closing_key)
1689 parent = self.db[parent_key]
1690 # Check for a "can't happen".
1691 if not parent.has_key(component):
1692 sys.stderr.write("%s: in path '%s', value for parent key '%s' "
1693 "does not have entry '%s'\n"
1694 % (error_prefix, svn_path, parent_key, component))
1695 sys.exit(1)
1696 this_entry_key = parent[component]
1697 this_entry_val = self.db[this_entry_key]
1698 # Swaparoo.
1699 parent_key = this_entry_key
1700 parent = this_entry_val
1702 self.bump_rev_count(parent_key, svn_rev, closing_key)
1704 def close_tags(self, svn_path, svn_rev, tags):
1705 """Record that as of SVN_REV, SVN_PATH could no longer be the
1706 source from which any of TAGS could be copied. SVN_PATH does not
1707 start with '/'."""
1708 self.close_names(svn_path, svn_rev, tags, self.tags_closing_revs_key)
1710 def close_branches(self, svn_path, svn_rev, branches):
1711 """Record that as of SVN_REV, SVN_PATH could no longer be the
1712 source from which any of BRANCHES could be copied. SVN_PATH does
1713 not start with '/'."""
1714 self.close_names(svn_path, svn_rev, branches, self.br_closing_revs_key)
1716 def score_revisions(self, openings, closings):
1717 """Return a list of revisions and scores based on OPENINGS and
1718 CLOSINGS. The returned list looks like:
1720 [(REV1, SCORE1), (REV2, SCORE2), ...]
1722 where REV2 > REV1. OPENINGS and CLOSINGS are the values of
1723 self.tags_opening_revs_key and self.tags_closing_revs_key, or
1724 self.br_opening_revs_key and self.br_closing_revs_key, from some file or
1725 directory node, or else None.
1727 Each score indicates that copying the corresponding revision (or any
1728 following revision up to the next revision in the list) of
1729 the object in question would yield that many correct paths at or
1730 underneath the object. There may be other paths underneath it
1731 which are not correct and need to be deleted or recopied; those
1732 can only be detected by descending and examining their scores.
1734 If OPENINGS is false, return the empty list."""
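    # Worked example (using the numbers attached to the topmost
    # directory in the SymbolicNameTracker docstring above): with
    # OPENINGS == [(2, 1), (3, 3)] and
    # CLOSINGS == [(5, 1), (17, 2), (50, 1)], this returns
    #
    #   [(2, 1), (3, 4), (5, 3), (17, 1), (50, 0)]
    #
    # that is, copying revision 3 (or 4) would yield 4 correct paths,
    # and the score drops as paths are closed from revision 5 onwards.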
1736 # First look for easy outs.
1737 if not openings:
1738 return []
1740 # Must be able to call len(closings) below.
1741 if closings is None:
1742 closings = []
1744 # No easy out, so wish for lexical closures and calculate the scores :-).
1745 scores = []
1746 opening_score_accum = 0
1747 for i in range(len(openings)):
1748 opening_rev, opening_score = openings[i]
1749 opening_score_accum = opening_score_accum + opening_score
1750 scores.append((opening_rev, opening_score_accum))
1751 min = 0
1752 for i in range(len(closings)):
1753 closing_rev, closing_score = closings[i]
1754 done_exact_rev = None
1755 insert_index = None
1756 insert_score = None
1757 for j in range(min, len(scores)):
1758 score_rev, score = scores[j]
1759 if score_rev >= closing_rev:
1760 if not done_exact_rev:
1761 if score_rev > closing_rev:
1762 insert_index = j
1763 insert_score = scores[j-1][1] - closing_score
1764 done_exact_rev = 1
1765 scores[j] = (score_rev, score - closing_score)
1766 else:
1767 min = j + 1
1768 if not done_exact_rev:
1769 scores.append((closing_rev,scores[-1][1] - closing_score))
1770 if insert_index is not None:
1771 scores.insert(insert_index, (closing_rev, insert_score))
1772 return scores
1774 def best_rev(self, scores, prefer_rev, limit_rev):
1775 """Return the revision older than LIMIT_REV with the highest score
1776 from SCORES, a list returned by score_revisions(). When the maximum score
1777 is shared by multiple revisions, the oldest revision is selected, unless
1778 PREFER_REV is one of the possibilities, in which case, it is selected."""
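    # For example (a sketch, continuing the score_revisions() example
    # above): best_rev([(2, 1), (3, 4), (5, 3), (17, 1), (50, 0)],
    # SVN_INVALID_REVNUM, 10) returns 3, the revision with the highest
    # score (4) among revisions older than 10; revisions 17 and 50 are
    # not considered because they are not older than LIMIT_REV.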
1779 max_score = 0
1780 prefer_rev_score = -1
1781 rev = SVN_INVALID_REVNUM
1782 for pair in scores:
1783 if pair[1] > max_score and pair[0] < limit_rev:
1784 max_score = pair[1]
1785 rev = pair[0]
1786 if pair[0] <= prefer_rev:
1787 prefer_rev_score = pair[1]
1788 if prefer_rev_score == max_score:
1789 rev = prefer_rev
1790 return rev
1792 def is_best_rev(self, scores, rev, limit_rev):
1793 """Return true if REV has the highest score for revisions older than
1794 LIMIT_REV from SCORES, a list returned by score_revisions()."""
1795 return self.best_rev(scores, rev, limit_rev) == rev
1797 # Helper for copy_descend().
1798 def cleanup_entries(self, rev, limit_rev, entries, is_tag):
1799 """Return a copy of ENTRIES, minus the individual entries whose
1800 highest scoring revision doesn't match REV (and also, minus any
1801 special '/'-denoted flags). IS_TAG is 1 or None, based on whether
1802 this work is being done for the sake of a tag or a branch."""
1803 if is_tag:
1804 opening_key = self.tags_opening_revs_key
1805 closing_key = self.tags_closing_revs_key
1806 else:
1807 opening_key = self.br_opening_revs_key
1808 closing_key = self.br_closing_revs_key
1810 new_entries = {}
1811 for key in entries.keys():
1812 if key[0] == '/': # Skip flags
1813 continue
1814 entry = entries.get(key)
1815 val = self.db[entry]
1816 scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1817 if self.is_best_rev(scores, rev, limit_rev):
1818 new_entries[key] = entry
1819 return new_entries
1821 # Helper for fill_branch().
1822 def copy_descend(self, dumper, ctx, name, parent, entry_name,
1823 parent_rev, src_path, dst_path, is_tag, jit_new_rev=None):
1824 """Starting with ENTRY_NAME in directory object PARENT at
1825 PARENT_REV, use DUMPER and CTX to copy nodes in the Subversion
1826 repository, manufacturing the source paths with SRC_PATH and the
1827 destination paths with NAME and DST_PATH.
1829 If IS_TAG is true, NAME is treated as a tag, else as a branch.
1831 If JIT_NEW_REV is not None, it is a list of one or two elements.
1832 If the first element is true, then if any copies are to be made,
1833 invoke DUMPER.start_revision() before the first copy, then set
1834 JIT_NEW_REV[0] to None, so no more new revisions are made for this
1835 symbolic name anywhere in this descent.
1837 The second element, if present, is the string to be used for the svn:date
1838 property of any JIT-created revision.
1840 ('JIT' == 'Just In Time'.)"""
1841 ### Hmmm, is passing [1] instead of 1 an idiomatic way of passing
1842 ### a side-effectable boolean in Python? That's how the
1843 ### JIT_NEW_REV parameter works here and elsewhere, but maybe
1844 ### there's a clearer way to do it?
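    # For example, Commit.commit() below invokes
    # sym_tracker.fill_branch(dumper, ctx, br, [1, date]): the leading 1
    # means "start a new revision before the first copy, if any copy
    # turns out to be needed", and the optional second element becomes
    # that revision's svn:date.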
1846 key = parent[entry_name]
1847 val = self.db[key]
1849 if is_tag:
1850 opening_key = self.tags_opening_revs_key
1851 closing_key = self.tags_closing_revs_key
1852 copyfrom_rev_key = self.tags_copyfrom_rev_key
1853 else:
1854 opening_key = self.br_opening_revs_key
1855 closing_key = self.br_closing_revs_key
1856 copyfrom_rev_key = self.br_copyfrom_rev_key
1858 limit_rev = dumper.revision
1859 if jit_new_rev and jit_new_rev[0]:
1860 # In this case the current rev is complete, so it is a valid
1861 # copyfrom source.
1862 limit_rev = limit_rev + 1
1864 if not val.has_key(copyfrom_rev_key):
1865 # If we haven't already copied this subdir, calculate its "best rev"
1866 # and see if it differs from parent's best rev.
1867 scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1868 rev = self.best_rev(scores, parent_rev, limit_rev)
1870 if rev == SVN_INVALID_REVNUM:
1871 return # name is a branch, but we're doing a tag, or vice versa
1873 else:
1874 if is_tag:
1875 copy_dst = make_path(ctx, dst_path, None, name)
1876 else:
1877 copy_dst = make_path(ctx, dst_path, name, None)
1879 expected_entries = self.cleanup_entries(rev, limit_rev,
1880 val, is_tag)
1881 if (rev != parent_rev):
1882 if jit_new_rev and jit_new_rev[0]:
1883 dumper.start_revision(make_revision_props(ctx, name, is_tag,
1884 len(jit_new_rev) > 1 and jit_new_rev[1] or None))
1885 jit_new_rev[0] = None
1886 if dumper.copy_path(src_path, rev, copy_dst, expected_entries):
1887 parent_rev = rev
1888 else:
1889 # If we didn't copy, then we need to prune
1890 dumper.prune_entries(copy_dst, expected_entries)
1891 else:
1892 # Even if we kept the already-present revision of this entry
1893 # instead of copying a new one, we still need to prune out
1894 # anything that's not part of the symbolic name.
1895 dumper.prune_entries(copy_dst, expected_entries)
1897 # Record that this copy is done:
1898 val[copyfrom_rev_key] = parent_rev
1899 if val.has_key(opening_key):
1900 del val[opening_key]
1901 if val.has_key(closing_key):
1902 del val[closing_key]
1903 self.db[key] = val
1905 for ent in val.keys():
1906 if not ent[0] == '/':
1907 if src_path:
1908 next_src = src_path + '/' + ent
1909 else:
1910 next_src = ent
1911 if dst_path:
1912 next_dst = dst_path + '/' + ent
1913 else:
1914 next_dst = ent
1915 self.copy_descend(dumper, ctx, name, val, ent, parent_rev,
1916 next_src, next_dst, is_tag, jit_new_rev)
1918 def fill_name(self, dumper, ctx, name, is_tag, jit_new_rev=None):
1919 """Use DUMPER to create all currently available parts of symbolic
1920 name NAME that have not been created already.
1922 If IS_TAG is true, NAME is treated as a tag, else as a branch.
1924 JIT_NEW_REV is as documented for the copy_descend() function."""
1926 # A source path looks like this in the symbolic name tree:
1928 # thisbranch/trunk/proj/foo/bar/baz.c
1930 # ...or occasionally...
1932 # thisbranch/branches/sourcebranch/proj/foo/bar/baz.c
1934 # (the latter when 'thisbranch' is branched off 'sourcebranch').
1936 # Meanwhile, we're copying to a location in the repository like
1938 # /branches/thisbranch/proj/foo/bar/baz.c or
1939 # /tags/tagname/proj/foo/bar/baz.c
1941 # Of course all this depends on make_path()'s behavior. At
1942 # various times we've changed the way it produces paths (see
1943 # revisions 6028 and 6347). If it changes again, the logic here
1944 # must be adjusted to match.
1946 parent_key = self.root_key
1947 parent = self.db[parent_key]
1949 # If there are no origin records, then we must've messed up earlier.
1950 if not parent.has_key(name):
1951 if is_tag:
1952 sys.stderr.write("%s: no origin records for tag '%s'.\n"
1953 % (error_prefix, name))
1954 else:
1955 sys.stderr.write("%s: no origin records for branch '%s'.\n"
1956 % (error_prefix, name))
1957 sys.exit(1)
1959 parent_key = parent[name]
1960 parent = self.db[parent_key]
1962 # All Subversion source paths under the branch start with one of
1963 # three things:
1965 # /trunk/...
1966 # /branches/foo/...
1967 # /tags/foo/...
1969 # (We don't care what foo is, it's just a component to skip over.)
1971 # Since these don't all have the same number of components, we
1972 # manually descend into each as far as necessary, then invoke
1973 # copy_descend() once we're in the right place in both trees.
1975 # Since it's possible for a branch or tag to have some source
1976 # paths on trunk and some on branches, there's some question about
1977 # what to copy as the top-level directory of the branch. Our
1978 # solution is to [somewhat randomly] give preference to trunk.
1979 # Note that none of these paths can ever conflict; for example,
1980 # it would be impossible to have both
1982 # thisbranch/trunk/myproj/lib/drivers.c and
1983 # thisbranch/branches/sourcebranch/myproj/lib/drivers.c
1985 # because that would imply that the symbolic name 'thisbranch'
1986 # appeared twice in the RCS file header, referring to two
1987 # different revisions. Well, I suppose that's *possible*, but its
1988 # effect is undefined, and it's as reasonable for us to just
1989 # overwrite one with the other as anything else -- anyway, isn't
1990 # that what CVS would do if you checked out the branch? <shrug>
1992 if parent.has_key(ctx.trunk_base):
1993 self.copy_descend(dumper, ctx, name, parent, ctx.trunk_base,
1994 SVN_INVALID_REVNUM, ctx.trunk_base, "",
1995 is_tag, jit_new_rev)
1996 if parent.has_key(ctx.branches_base):
1997 branch_base_key = parent[ctx.branches_base]
1998 branch_base = self.db[branch_base_key]
1999 for this_source in branch_base.keys():
2000 # We skip special names beginning with '/' for the usual
2001 # reason. We skip cases where (this_source == name) for a
2002 # different reason: if a CVS branch were rooted in itself,
2003 # that would imply that the same symbolic name appeared on two
2004 # different branches in an RCS file, which CVS doesn't
2005 # permit. So while it wouldn't hurt to descend, it would be a
2006 # waste of time.
2007 if (this_source[0] != '/') and (this_source != name):
2008 src_path = ctx.branches_base + '/' + this_source
2009 self.copy_descend(dumper, ctx, name, branch_base, this_source,
2010 SVN_INVALID_REVNUM, src_path, "",
2011 is_tag, jit_new_rev)
2013 def fill_tag(self, dumper, ctx, tag, jit_new_rev=None):
2014 """Use DUMPER to create all currently available parts of TAG that
2015 have not been created already. Use CTX.trunk_base, CTX.tags_base,
2016 and CTX.branches_base to determine the source and destination
2017 paths in the Subversion repository.
2019 JIT_NEW_REV is as documented for the copy_descend() function."""
2020 self.fill_name(dumper, ctx, tag, 1, jit_new_rev)
2022 def fill_branch(self, dumper, ctx, branch, jit_new_rev=None):
2023 """Use DUMPER to create all currently available parts of BRANCH that
2024 haven't been created already. Use CTX.trunk_base, CTX.tags_base,
2025 and CTX.branches_base to determine the source and destination
2026 paths in the Subversion repository.
2028 JIT_NEW_REV is as documented for the copy_descend() function."""
2029 self.fill_name(dumper, ctx, branch, None, jit_new_rev)
2031 def finish(self, dumper, ctx):
2032 """Use DUMPER to finish branches and tags that have either
2033 not been created yet, or have been only partially created.
2034 Use CTX.trunk_base, CTX.tags_base, and CTX.branches_base to
2035 determine the source and destination paths in the Subversion
2036 repository."""
2037 parent_key = self.root_key
2038 parent = self.db[parent_key]
2039 # Do all branches first, then all tags. We don't bother to check
2040 # here whether a given name is a branch or a tag, or is done
2041 # already; the fill_foo() methods will just do nothing if there's
2042 # nothing to do.
2044 # We do one revision per branch or tag, for clarity to users, not
2045 # for correctness. In CVS, when you make a branch off a branch,
2046 # the new branch will just root itself in the roots of the old
2047 # branch *except* where the new branch sprouts from a revision
2048 # that was actually committed on the old branch. In the former
2049 # cases, the source paths will be the same as the source paths
2050 # from which the old branch was created and therefore will already
2051 # exist; and in the latter case, the source paths will actually be
2052 # on the old branch, but those paths will exist already because
2053 # they were commits on that branch and therefore cvs2svn must have
2054 # created it already (see the fill_branch call in Commit.commit).
2055 # So either way, the source paths exist by the time we need them.
2057 ### It wouldn't be so awfully hard to determine whether a name is
2058 ### just a branch or just a tag, which would allow for more
2059 ### intuitive messages below.
2060 if not ctx.trunk_only:
2061 print "Finishing branches:"
2062 for name in parent.keys():
2063 if name[0] != '/':
2064 print "finishing '%s' as branch" % name
2065 self.fill_branch(dumper, ctx, name, [1])
2066 print "Finishing tags:"
2067 for name in parent.keys():
2068 if name[0] != '/':
2069 print "finishing '%s' as tag" % name
2070 self.fill_tag(dumper, ctx, name, [1])
2073 def is_trunk_vendor_revision(default_branches_db, cvs_path, cvs_rev):
2074 """Return 1 if CVS_REV of CVS_PATH is a trunk (i.e., head) vendor
2075 revision according to DEFAULT_BRANCHES_DB, else return None."""
2076 if default_branches_db.has_key(cvs_path):
2077 val = default_branches_db[cvs_path]
2078 val_last_dot = val.rindex(".")
2079 received_last_dot = cvs_rev.rindex(".")
2080 default_branch = val[:val_last_dot]
2081 received_branch = cvs_rev[:received_last_dot]
2082 default_rev_component = int(val[val_last_dot + 1:])
2083 received_rev_component = int(cvs_rev[received_last_dot + 1:])
2084 if (default_branch == received_branch
2085 and received_rev_component <= default_rev_component):
2086 return 1
2087 # else
2088 return None
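# For example (hedged sketch): if DEFAULT_BRANCHES_DB maps 'proj/foo.c'
# to '1.1.1.2', then revisions '1.1.1.1' and '1.1.1.2' of that file are
# trunk vendor revisions (same branch '1.1.1', revision component <= 2),
# whereas '1.1.1.3' and '1.2' are not.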
2091 class Commit:
2092 def __init__(self, author, log):
2093 self.author = author
2094 self.log = log
2096 self.files = { }
2098 # For consistency, the elements of both lists are of the form
2100 # (file, rev, deltatext_code, branch_name, tags, branches)
2102 # even though self.deletes doesn't use the deltatext_code.
2103 self.changes = [ ]
2104 self.deletes = [ ]
2106 # Start out with a t_min higher than any incoming time T, and a
2107 # t_max lower than any incoming T. This way the first T will
2108 # push t_min down to T, and t_max up to T, naturally (without any
2109 # special-casing), and successive times will then ratchet them
2110 # outward as appropriate.
2111 self.t_min = 1L<<32
2112 self.t_max = 0
2114 def has_file(self, fname):
2115 return self.files.has_key(fname)
2117 def add(self, t, op, file, rev, deltatext_code, branch_name, tags, branches):
2118 # Record the time range of this commit.
2120 # ### ISSUE: It's possible, though unlikely, that the time range
2121 # of a commit could get gradually expanded to be arbitrarily
2122 # longer than COMMIT_THRESHOLD. I'm not sure this is a huge
2123 # problem, and anyway deciding where to break it up would be a
2124 # judgement call. For now, we just print a warning in commit() if
2125 # this happens.
2126 if t < self.t_min:
2127 self.t_min = t
2128 if t > self.t_max:
2129 self.t_max = t
2131 if op == OP_CHANGE:
2132 self.changes.append((file, rev, deltatext_code, branch_name,
2133 tags, branches))
2134 else:
2135 # OP_DELETE
2136 self.deletes.append((file, rev, deltatext_code, branch_name,
2137 tags, branches))
2138 self.files[file] = 1
2140 def commit(self, dumper, ctx, sym_tracker):
2141 # commit this transaction
2142 seconds = self.t_max - self.t_min
2143 print 'committing: %s, over %d seconds' % (time.ctime(self.t_min), seconds)
2144 if seconds > COMMIT_THRESHOLD:
2145 print '%s: commit spans more than %d seconds' \
2146 % (warning_prefix, COMMIT_THRESHOLD)
2148 if ctx.dry_run:
2149 for f, r, dt_code, br, tags, branches in self.changes:
2150 # compute a repository path, dropping the ,v from the file name
2151 svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
2152 print " adding or changing '%s' : '%s'" % (r, svn_path)
2153 for f, r, dt_code, br, tags, branches in self.deletes:
2154 # compute a repository path, dropping the ,v from the file name
2155 svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
2156 print " deleting '%s' : '%s'" % (r, svn_path)
2157 print ' (skipped; dry run enabled)'
2158 return
2160 do_copies = [ ]
2162 # State for handling default branches.
2164 # Here is a tempting, but ultimately nugatory, bit of logic, which
2165 # I share with you so you may appreciate the less attractive, but
2166 # refreshingly non-nugatory, logic which follows it:
2168 # If some of the commits in this txn happened on a non-trunk
2169 # default branch, then those files will have to be copied into
2170 # trunk manually after being changed on the branch (because the
2171 # RCS "default branch" appears as head, i.e., trunk, in practice).
2172 # As long as those copies don't overwrite any trunk paths that
2173 # were also changed in this commit, then we can do the copies in
2174 # the same revision, because they won't cover changes that don't
2175 # appear anywhere/anywhen else. However, if some of the trunk dst
2176 # paths *did* change in this commit, then immediately copying the
2177 # branch changes would lose those trunk mods forever. So in this
2178 # case, we need to do at least that copy in its own revision. And
2179 # for simplicity's sake, if we're creating the new revision for
2180 # even one file, then we just do all such copies together in the
2181 # new revision.
2183 # Doesn't that sound nice?
2185 # Unfortunately, Subversion doesn't support copies with sources
2186 # in the current txn. All copies must be based in committed
2187 # revisions. Therefore, we generate the above-described new
2188 # revision unconditionally.
2190 # Each of these is a list of tuples. Each tuple is of the form:
2192 # (cvs_path, branch_name, tags_rooted_here, branches_rooted_here)
2194 # and a tuple is created for each default branch commit that will
2195 # need to be copied to trunk (or deleted from trunk) in the
2196 # generated revision following the "regular" revision.
2197 default_branch_copies = [ ]
2198 default_branch_deletes = [ ]
2200 # we already have the date, so just format it
2201 date = format_date(self.t_max)
2202 try:
2203 ### FIXME: The 'replace' behavior should be an option, like
2204 ### --encoding is.
2205 unicode_author = unicode(self.author, ctx.encoding, 'replace')
2206 unicode_log = unicode(self.log, ctx.encoding, 'replace')
2207 props = { 'svn:author' : unicode_author.encode('utf8'),
2208 'svn:log' : unicode_log.encode('utf8'),
2209 'svn:date' : date }
2210 except UnicodeError:
2211 print '%s: problem encoding author or log message:' % warning_prefix
2212 print " author: '%s'" % self.author
2213 print " log: '%s'" % self.log
2214 print " date: '%s'" % date
2215 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
2216 print " rev %s of '%s'" % (cvs_rev, rcs_file)
2217 print "Consider rerunning with (for example) '--encoding=latin1'."
2218 # Just fall back to the original data.
2219 props = { 'svn:author' : self.author,
2220 'svn:log' : self.log,
2221 'svn:date' : date }
2224 # Tells whether we actually wrote anything to the dumpfile.
2225 svn_rev = SVN_INVALID_REVNUM
2227 # If any of the changes we are about to do are on branches, we need to
2228 # check and maybe fill them (in their own revisions) *before* we start
2229 the data revision. So we have to iterate over changes and deletes twice.
2230 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
2231 # compute a repository path, dropping the ,v from the file name
2232 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2233 svn_path = make_path(ctx, cvs_path, br)
2234 if br:
2235 ### FIXME: Here is an obvious optimization point. Probably
2236 ### dump.probe_path(PATH) is kind of slow, because it does N
2237 ### database lookups for the N components in PATH. If this
2238 ### turns out to be a performance bottleneck, we can just
2239 ### maintain a database mirroring just the head tree, but
2240 ### keyed on full paths, to reduce the check to a quick
2241 ### constant time query.
2242 if not dumper.probe_path(svn_path):
2243 sym_tracker.fill_branch(dumper, ctx, br, [1, date])
2245 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.deletes:
2246 # compute a repository path, dropping the ,v from the file name
2247 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2248 svn_path = make_path(ctx, cvs_path, br)
2249 if br:
2250 ### FIXME: Here is an obvious optimization point. Probably
2251 ### dump.probe_path(PATH) is kind of slow, because it does N
2252 ### database lookups for the N components in PATH. If this
2253 ### turns out to be a performance bottleneck, we can just
2254 ### maintain a database mirroring just the head tree, but
2255 ### keyed on full paths, to reduce the check to a quick
2256 ### constant time query.
2257 if not dumper.probe_path(svn_path):
2258 sym_tracker.fill_branch(dumper, ctx, br, [1, date])
2261 # Now that any branches we need exist, we can do the commits.
2262 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
2263 # compute a repository path, dropping the ,v from the file name
2264 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2265 svn_path = make_path(ctx, cvs_path, br)
2266 if svn_rev == SVN_INVALID_REVNUM:
2267 svn_rev = dumper.start_revision(props)
2268 sym_tracker.enroot_tags(svn_path, svn_rev, tags)
2269 sym_tracker.enroot_branches(svn_path, svn_rev, branches)
2270 print " adding or changing %s : '%s'" % (cvs_rev, svn_path)
2272 # Only make a change if we need to. When 1.1.1.1 has an empty
2273 # deltatext, the explanation is almost always that we're looking
2274 # at an imported file whose 1.1 and 1.1.1.1 are identical. On
2275 # such imports, CVS creates an RCS file where 1.1 has the
2276 # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2277 # content as 1.1. There's no reason to reflect this non-change
2278 # in the repository, so we want to do nothing in this case. (If
2279 # we were really paranoid, we could make sure 1.1's log message
2280 # is the CVS-generated "Initial revision\n", but I think the
2281 # conditions below are strict enough.)
2282 if not ((dt_code == DELTATEXT_EMPTY) and (cvs_rev == "1.1.1.1")
2283 and dumper.probe_path(svn_path)):
2284 closed_tags, closed_branches = \
2285 dumper.add_or_change_path(cvs_path,
2286 svn_path,
2287 cvs_rev,
2288 rcs_file,
2289 tags,
2290 branches,
2291 ctx.cvs_revnums)
2292 if is_trunk_vendor_revision(ctx.default_branches_db,
2293 cvs_path, cvs_rev):
2294 default_branch_copies.append((cvs_path, br, tags, branches))
2295 sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
2296 sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
2298 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.deletes:
2299 # compute a repository path, dropping the ,v from the file name
2300 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2301 svn_path = make_path(ctx, cvs_path, br)
2302 print " deleting %s : '%s'" % (cvs_rev, svn_path)
2303 if svn_rev == SVN_INVALID_REVNUM:
2304 svn_rev = dumper.start_revision(props)
2305 # Uh, can this even happen on a deleted path? Hmmm. If not,
2306 # there's no risk, since tags and branches would just be empty
2307 # and therefore enrooting would be a no-op. Still, it would
2308 # be clearer to know for sure and simply not call it.
2309 sym_tracker.enroot_tags(svn_path, svn_rev, tags)
2310 sym_tracker.enroot_branches(svn_path, svn_rev, branches)
2311 ### FIXME: this will return path_deleted == None if no path
2312 ### was deleted. But we'll already have started the revision
2313 ### by then, so it's a bit late to use the knowledge! Need to
2314 ### reorganize things so that starting the revision is a
2315 ### callback with its own internal conditional, so anyone can
2316 ### just invoke when they know they're really about to do
2317 ### something.
2319 ### Right now what happens is we get an empty revision
2320 ### (assuming nothing else happened in this revision).
2321 path_deleted, closed_tags, closed_branches = \
2322 dumper.delete_path(svn_path, tags, branches, ctx.prune)
2323 if is_trunk_vendor_revision(ctx.default_branches_db, cvs_path, cvs_rev):
2324 default_branch_deletes.append((cvs_path, br, tags, branches))
2325 sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
2326 sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
2328 if svn_rev == SVN_INVALID_REVNUM:
2329 print ' no new revision created, as nothing to do'
2330 else:
2331 print ' new revision:', svn_rev
2332 if default_branch_copies or default_branch_deletes:
2333 previous_rev = svn_rev
2334 msg = 'This commit was generated by cvs2svn to compensate for ' \
2335 'changes in r%d,\n' \
2336 'which included commits to RCS files with non-trunk default ' \
2337 'branches.\n' % previous_rev
2338 props = { 'svn:author' : 'cvs2svn',
2339 'svn:log' : msg,
2340 'svn:date' : date }
2341 svn_rev = dumper.start_revision(props)
2343 for cvs_path, br, tags, branches in default_branch_copies:
2344 src_path = make_path(ctx, cvs_path, br)
2345 dst_path = make_path(ctx, cvs_path)
2346 if (dumper.probe_path(dst_path)):
2347 ign, closed_tags, closed_branches = \
2348 dumper.delete_path(dst_path, tags, branches, ctx.prune)
2349 sym_tracker.close_tags(dst_path, svn_rev, closed_tags)
2350 sym_tracker.close_branches(dst_path, svn_rev, closed_branches)
2351 dumper.copy_path(src_path, previous_rev, dst_path)
2353 for cvs_path, br, tags, branches in default_branch_deletes:
2354 # Ignore the branch -- we don't need to know the default
2355 # branch, we already know we're deleting this from trunk.
2356 dst_path = make_path(ctx, cvs_path)
2357 if (dumper.probe_path(dst_path)):
2358 ign, closed_tags, closed_branches = \
2359 dumper.delete_path(dst_path, tags, branches, ctx.prune)
2360 sym_tracker.close_tags(dst_path, svn_rev, closed_tags)
2361 sym_tracker.close_branches(dst_path, svn_rev, closed_branches)
2364 def read_resync(fname):
2365 "Read the .resync file into memory."
2367 ### note that we assume that we can hold the entire resync file in
2368 ### memory. really large repositories with whacky timestamps could
2369 ### bust this assumption. should that ever happen, then it is possible
2370 ### to split the resync file into pieces and make multiple passes,
2371 ### using each piece.
2374 # A digest maps to a sequence of lists which specify a lower and upper
2375 # time bound for matching up the commit. We keep a sequence of these
2376 # because a number of checkins with the same log message (e.g. an empty
2377 # log message) could need to be remapped. We also make them a list because
2378 # we will dynamically expand the lower/upper bound as we find commits
2379 # that fall into a particular msg and time range.
2381 # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
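  # For example (a sketch; the exact field widths are fixed by the
  # writer of the .resync file and by DIGEST_END_IDX): a line of the form
  #
  #   <t1 as 8 hex digits> <hex digest of the log message> <t2 in hex>\n
  #
  # contributes [t1 - COMMIT_THRESHOLD/2, t1 + COMMIT_THRESHOLD/2, t2]
  # to resync[digest].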
2383 resync = { }
2385 for line in fileinput.FileInput(fname):
2386 t1 = int(line[:8], 16)
2387 digest = line[9:DIGEST_END_IDX]
2388 t2 = int(line[DIGEST_END_IDX+1:], 16)
2389 t1_l = t1 - COMMIT_THRESHOLD/2
2390 t1_u = t1 + COMMIT_THRESHOLD/2
2391 if resync.has_key(digest):
2392 resync[digest].append([t1_l, t1_u, t2])
2393 else:
2394 resync[digest] = [ [t1_l, t1_u, t2] ]
2396 # For each digest, sort the resync items in it in increasing order,
2397 # based on the lower time bound.
2398 digests = resync.keys()
2399 for digest in digests:
2400 (resync[digest]).sort()
2402 return resync
2405 def parse_revs_line(line):
2406 data = line.split(' ', 7)
2407 timestamp = int(data[0], 16)
2408 id = data[1]
2409 op = data[2]
2410 rev = data[3]
2411 deltatext_code = data[4]
2412 branch_name = data[5]
2413 if branch_name == "*":
2414 branch_name = None
2415 ntags = int(data[6])
2416 tags = data[7].split(' ', ntags + 1)
2417 nbranches = int(tags[ntags])
2418 branches = tags[ntags + 1].split(' ', nbranches)
2419 fname = branches[nbranches][:-1] # strip \n
2420 tags = tags[:ntags]
2421 branches = branches[:nbranches]
2423 return timestamp, id, op, rev, deltatext_code, \
2424 fname, branch_name, tags, branches
2427 def write_revs_line(output, timestamp, digest, op, revision,
2428 deltatext_code, fname, branch_name, tags, branches):
2429 output.write('%08lx %s %s %s %s ' % \
2430 (timestamp, digest, op, revision, deltatext_code))
2431 if not branch_name:
2432 branch_name = "*"
2433 output.write('%s ' % branch_name)
2434 output.write('%d ' % (len(tags)))
2435 for tag in tags:
2436 output.write('%s ' % (tag))
2437 output.write('%d ' % (len(branches)))
2438 for branch in branches:
2439 output.write('%s ' % (branch))
2440 output.write('%s\n' % fname)
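# Taken together, parse_revs_line() and write_revs_line() define the
# revs-line format (a sketch, with placeholders for the field values):
#
#   <8-hex timestamp> <digest> <op> <rev> <deltatext-code> <branch or *> <ntags> <tag>... <nbranches> <branch>... <rcs filename>\n
#
# where '*' stands for "no branch name" (i.e. trunk), and the tag and
# branch lists are preceded by their counts so the line can be split
# unambiguously.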
2443 def pass1(ctx):
2444 cd = CollectData(ctx.cvsroot, DATAFILE, ctx.default_branches_db)
2445 p = rcsparse.Parser()
2446 stats = [ 0 ]
2447 os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
2448 if ctx.verbose:
2449 print 'processed', stats[0], 'files'
2450 if len(cd.fatal_errors) > 0:
2451 sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
2452 + "Error summary:\n"
2453 + "\n".join(cd.fatal_errors)
2454 + "\nExited due to fatal error(s).")
2456 def pass2(ctx):
2457 "Pass 2: clean up the revision information."
2459 # We may have recorded some changes in revisions' timestamp. We need to
2460 # scan for any other files which may have had the same log message and
2461 # occurred at "the same time" and change their timestamps, too.
2463 # read the resync data file
2464 resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)
2466 output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')
2468 # process the revisions file, looking for items to clean up
2469 for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
2470 timestamp, digest, op, rev, deltatext_code, fname, \
2471 branch_name, tags, branches = parse_revs_line(line)
2472 if not resync.has_key(digest):
2473 output.write(line)
2474 continue
2476 # we have a hit. see if this is "near" any of the resync records we
2477 # have recorded for this digest [of the log message].
2478 for record in resync[digest]:
2479 if record[0] <= timestamp <= record[1]:
2480 # bingo! remap the time on this (record[2] is the new time).
2481 write_revs_line(output, record[2], digest, op, rev,
2482 deltatext_code, fname, branch_name, tags, branches)
2484 print "RESYNC: '%s' (%s) : old time='%s' new time='%s'" \
2485 % (relative_name(ctx.cvsroot, fname),
2486 rev, time.ctime(timestamp), time.ctime(record[2]))
2488 # adjust the time range. we want the COMMIT_THRESHOLD from the
2489 # bounds of the earliest/latest commit in this group.
2490 record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
2491 record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)
2493 # stop looking for hits
2494 break
2495 else:
2496 # the file/rev did not need to have its time changed.
2497 output.write(line)
2500 def pass3(ctx):
2501 # sort the log files
2503 # GNU sort will sort our dates differently (incorrectly!) if our
2504 # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
2505 # it to 'C'
2506 lc_all_tmp = os.getenv('LC_ALL')
2507 os.putenv('LC_ALL', 'C')
2508 run_command('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
2509 ctx.log_fname_base + SORTED_REVS_SUFFIX))
2510 if lc_all_tmp is not None:
2511 os.putenv('LC_ALL', lc_all_tmp)
2512 else:
2513 os.unsetenv('LC_ALL')
2516 def pass4(ctx):
2517 sym_tracker = SymbolicNameTracker()
2518 metadata_db = Database(METADATA_DB, 'r')
2520 # A dictionary of Commit objects, keyed by digest. Each object
2521 # represents one logical commit, which may involve multiple files.
2523 # The reason this is a dictionary, not a single object, is that
2524 # there may be multiple commits interleaved in time. A commit can
2525 # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
2526 # for parts of some other commit to occur. Since the s-revs file is
2527 # sorted by timestamp first, then by digest within each timestamp,
2528 # it's quite easy to have interleaved commits.
2529 commits = { }
2531 # The total number of separate commits processed. This is used only for
2532 # printing statistics; it does not affect the results in the repository.
2533 count = 0
2535 # Start the dumpfile object.
2536 dumper = Dumper(ctx)
2538 # process the logfiles, creating the target
2539 for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
2540 timestamp, id, op, rev, deltatext_code, fname, \
2541 branch_name, tags, branches = parse_revs_line(line)
2543 if ctx.trunk_only and not trunk_rev.match(rev):
2544 ### note this could/should have caused a flush, but the next item
2545 ### will take care of that for us
2546 continue
2548 # Each time we read a new line, we scan the commits we've
2549 # accumulated so far to see if any are ready for processing now.
2550 process = [ ]
2551 for scan_id, scan_c in commits.items():
2552 if scan_c.t_max + COMMIT_THRESHOLD < timestamp:
2553 process.append((scan_c.t_max, scan_c))
2554 del commits[scan_id]
2555 continue
2556 # If the inbound commit is on the same file as a pending commit,
2557 # close the pending commit to further changes. Don't flush it though,
2558 # as there may be other pending commits dated before this one.
2559 # ### ISSUE: the has_file() check below is not optimal.
2560 # It does fix the dataloss bug where revisions would get lost
2561 # if checked in too quickly, but it can also break apart the
2562 # commits. The correct fix would require tracking the dependencies
2563 # between change sets and committing them in proper order.
2564 if scan_c.has_file(fname):
2565 unused_id = scan_id + '-'
2566 while commits.has_key(unused_id):
2567 unused_id = unused_id + '-'
2568 commits[unused_id] = scan_c
2569 del commits[scan_id]
2571 # If there are any elements in 'process' at this point, they need
2572 # to be committed, because this latest rev couldn't possibly be
2573 # part of any of them. Sort them into time-order, then commit 'em.
2574 process.sort()
2575 for t_max, c in process:
2576 c.commit(dumper, ctx, sym_tracker)
2577 count = count + len(process)
2579 # Add this item into the set of still-available commits.
2580 if commits.has_key(id):
2581 c = commits[id]
2582 else:
2583 author, log = metadata_db[id]
2584 c = commits[id] = Commit(author, log)
2585 c.add(timestamp, op, fname, rev, deltatext_code, branch_name,
2586 tags, branches)
2588 # End of the sorted revs file. Flush any remaining commits:
2589 if commits:
2590 process = [ ]
2591 for id, c in commits.items():
2592 process.append((c.t_max, c))
2593 process.sort()
2594 for t_max, c in process:
2595 c.commit(dumper, ctx, sym_tracker)
2596 count = count + len(process)
2598 # Create (or complete) any branches and tags not already done.
2599 sym_tracker.finish(dumper, ctx)
2601 dumper.close()
2603 if ctx.verbose:
2604 print count, 'commits processed.'
2607 def pass5(ctx):
2608 if ctx.skip_cleanup:
2609 return
2611 # Remove our database files
2612 os.unlink(SVN_REVISIONS_DB)
2613 os.unlink(NODES_DB)
2614 os.unlink(SYMBOLIC_NAME_ROOTS_DB)
2615 os.unlink(SYMBOLIC_NAMES_DB)
2616 os.unlink(METADATA_DB)
2618 # This is the only DB reference still reachable at this point; lose
2619 # it before removing the file.
2620 ctx.default_branches_db = None
2621 os.unlink(DEFAULT_BRANCHES_DB)
2623 # Remove our other data files
2624 for suffix in (REVS_SUFFIX, CLEAN_REVS_SUFFIX,
2625 SORTED_REVS_SUFFIX, RESYNC_SUFFIX):
2626 os.unlink('cvs2svn-data' + suffix)
2629 _passes = [
2630 pass1,
2631 pass2,
2632 pass3,
2633 pass4,
2634 pass5,
2635 ]
2638 class _ctx:
2639 pass
2642 def convert(ctx, start_pass=1):
2643 "Convert a CVS repository to an SVN repository."
2645 if not os.path.exists(ctx.cvsroot):
2646 sys.stderr.write(error_prefix + ': \'%s\' does not exist.\n' % ctx.cvsroot)
2647 sys.exit(1)
2649 times = [ None ] * len(_passes)
2650 for i in range(start_pass - 1, len(_passes)):
2651 times[i] = time.time()
2652 print '----- pass %d -----' % (i + 1)
2653 _passes[i](ctx)
2654 times.append(time.time())
2656 for i in range(start_pass, len(_passes)+1):
2657 print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
2658 print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
2661 def usage(ctx):
2662 print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
2663 % os.path.basename(sys.argv[0])
2664 print ' --help, -h print this usage message and exit with success'
2665 print ' -n dry run; parse CVS repos, but do not construct SVN repos'
2666 print ' -v verbose'
2667 print ' -s PATH path for SVN repos'
2668 print ' -p NUM start at pass NUM of %d' % len(_passes)
2669 print ' --existing-svnrepos load into existing SVN repository'
2670 print ' --dumpfile=PATH name of intermediate svn dumpfile'
2671 print ' --svnadmin=PATH path to the svnadmin program'
2672 print ' --trunk-only convert only trunk commits, not tags nor branches'
2673 print ' --trunk=PATH path for trunk (default: %s)' \
2674 % ctx.trunk_base
2675 print ' --branches=PATH path for branches (default: %s)' \
2676 % ctx.branches_base
2677 print ' --tags=PATH path for tags (default: %s)' \
2678 % ctx.tags_base
2679 print ' --no-prune don\'t prune empty directories'
2680 print ' --dump-only just produce a dumpfile, don\'t commit to a repos'
2681 print ' --encoding=ENC encoding of log messages in CVS repos (default: %s)' \
2682 % ctx.encoding
2683 print ' --username=NAME username for cvs2svn-synthesized commits'
2684 print ' (default: %s)' \
2685 % ctx.username
2686 print ' --skip-cleanup prevent the deletion of intermediate files'
2687 print ' --bdb-txn-nosync pass --bdb-txn-nosync to "svnadmin create"'
2688 print ' --cvs-revnums record CVS revision numbers as file properties'
2692 def main():
2693 # prepare the operation context
2694 ctx = _ctx()
2695 ctx.cvsroot = None
2696 ctx.target = None
2697 ctx.log_fname_base = DATAFILE
2698 ctx.dumpfile = DUMPFILE
2699 ctx.verbose = 0
2700 ctx.dry_run = 0
2701 ctx.prune = 1
2702 ctx.existing_svnrepos = 0
2703 ctx.dump_only = 0
2704 ctx.trunk_only = 0
2705 ctx.trunk_base = "trunk"
2706 ctx.tags_base = "tags"
2707 ctx.branches_base = "branches"
2708 ctx.encoding = "ascii"
2709 ctx.svnadmin = "svnadmin"
2710 ctx.username = "unknown"
2711 ctx.print_help = 0
2712 ctx.skip_cleanup = 0
2713 ctx.cvs_revnums = 0
2714 ctx.bdb_txn_nosync = 0
2716 start_pass = 1
2718 try:
2719 opts, args = getopt.getopt(sys.argv[1:], 'p:s:vnh',
2720 [ "help", "create", "trunk=",
2721 "username=", "existing-svnrepos",
2722 "branches=", "tags=", "encoding=",
2723 "trunk-only", "no-prune",
2724 "dump-only", "dumpfile=", "svnadmin=",
2725 "skip-cleanup", "cvs-revnums",
2726 "bdb-txn-nosync"])
2727 except getopt.GetoptError, e:
2728 sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
2729 usage(ctx)
2730 sys.exit(1)
2732 for opt, value in opts:
2733 if opt == '-p':
2734 start_pass = int(value)
2735 if start_pass < 1 or start_pass > len(_passes):
2736 print '%s: illegal value (%d) for starting pass. ' \
2737 'must be 1 through %d.' % (error_prefix, start_pass,
2738 len(_passes))
2739 sys.exit(1)
2740 elif (opt == '--help') or (opt == '-h'):
2741 ctx.print_help = 1
2742 elif opt == '-v':
2743 ctx.verbose = 1
2744 elif opt == '-n':
2745 ctx.dry_run = 1
2746 elif opt == '-s':
2747 ctx.target = value
2748 elif opt == '--existing-svnrepos':
2749 ctx.existing_svnrepos = 1
2750 elif opt == '--dumpfile':
2751 ctx.dumpfile = value
2752 elif opt == '--svnadmin':
2753 ctx.svnadmin = value
2754 elif opt == '--trunk-only':
2755 ctx.trunk_only = 1
2756 elif opt == '--trunk':
2757 ctx.trunk_base = value
2758 elif opt == '--branches':
2759 ctx.branches_base = value
2760 elif opt == '--tags':
2761 ctx.tags_base = value
2762 elif opt == '--no-prune':
2763 ctx.prune = None
2764 elif opt == '--dump-only':
2765 ctx.dump_only = 1
2766 elif opt == '--encoding':
2767 ctx.encoding = value
2768 elif opt == '--username':
2769 ctx.username = value
2770 elif opt == '--skip-cleanup':
2771 ctx.skip_cleanup = 1
2772 elif opt == '--cvs-revnums':
2773 ctx.cvs_revnums = 1
2774 elif opt == '--bdb-txn-nosync':
2775 ctx.bdb_txn_nosync = 1
2776 elif opt == '--create':
2777 sys.stderr.write(warning_prefix +
2778 ': The behaviour produced by the --create option is now the '
2779 'default,\nand passing the option is deprecated.\n')
2781 if ctx.print_help:
2782 usage(ctx)
2783 sys.exit(0)
2785 # Consistency check for options and arguments.
2786 if len(args) == 0:
2787 usage(ctx)
2788 sys.exit(1)
2790 if len(args) > 1:
2791 sys.stderr.write(error_prefix +
2792 ": must pass only one CVS repository.\n")
2793 usage(ctx)
2794 sys.exit(1)
2796 ctx.cvsroot = args[0]
2798 if not os.path.isdir(ctx.cvsroot):
2799 sys.stderr.write(error_prefix +
2800 ": the cvs-repos-path '%s' is not an "
2801 "existing directory.\n" % ctx.cvsroot)
2802 sys.exit(1)
2804 if (not ctx.target) and (not ctx.dump_only):
2805 sys.stderr.write(error_prefix +
2806 ": must pass one of '-s' or '--dump-only'.\n")
2807 sys.exit(1)
2809 def not_both(opt1val, opt1name, opt2val, opt2name):
2810 if opt1val and opt2val:
2811 sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
2812 % (opt1name, opt2name))
2813 sys.exit(1)
2814 not_both(ctx.target, '-s', ctx.dump_only, '--dump-only')
2816 not_both(ctx.dump_only, '--dump-only',
2817 ctx.existing_svnrepos, '--existing-svnrepos')
2819 not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
2820 ctx.existing_svnrepos, '--existing-svnrepos')
2822 not_both(ctx.dump_only, '--dump-only',
2823 ctx.bdb_txn_nosync, '--bdb-txn-nosync')
2825 if ((string.find(ctx.trunk_base, '/') > -1)
2826 or (string.find(ctx.tags_base, '/') > -1)
2827 or (string.find(ctx.branches_base, '/') > -1)):
2828 sys.stderr.write("%s: cannot pass multicomponent path to "
2829 "--trunk, --tags, or --branches yet.\n"
2830 " See http://subversion.tigris.org/issues/show_bug.cgi?"
2831 "id=1409 "
2832 "for details.\n" % error_prefix)
2833 sys.exit(1)
2835 if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
2836 sys.stderr.write(error_prefix +
2837 ": the svn-repos-path '%s' is not an "
2838 "existing directory.\n" % ctx.target)
2839 sys.exit(1)
2841 if not ctx.dump_only and not ctx.existing_svnrepos \
2842 and os.path.exists(ctx.target):
2843 sys.stderr.write(error_prefix +
2844 ": the svn-repos-path '%s' exists.\nRemove it, or pass "
2845 "'--existing-svnrepos'.\n" % ctx.target)
2846 sys.exit(1)
2848 ctx.default_branches_db = Database(DEFAULT_BRANCHES_DB, 'n')
2850 convert(ctx, start_pass=start_pass)
2853 if __name__ == '__main__':
2854 main()