cvs2svn.py

   1 #!/usr/bin/env python
   2 #
   3 # cvs2svn: ...
   4 #
   5
   6 # $LastChangedRevision$
   7
   8 import rcsparse
   9 import os
  10 import sys
  11 import sha
  12 import re
  13 import time
  14 import fileinput
  15 import string
  16 import getopt
  17 import stat
  18 import md5
  19 import anydbm
  20 import marshal
  21
  22 # Make sure this Python is recent enough.
  23 import sys
  24 if sys.hexversion < 0x2000000:
  25   sys.stderr.write('Python 2.0 or higher is required; see www.python.org.\n')
  26   sys.exit(1)
  27
  28 # Don't settle for less.
  29 if anydbm._defaultmod.__name__ == 'dumbdbm':
  30   print 'ERROR: your installation of Python does not contain a proper'
  31   print '  DBM module. This script cannot continue.'
  32   print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  33   print '  for details.'
  34   sys.exit(1)
  35
  36 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
  37 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
  38 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
  39
  40 DATAFILE = 'cvs2svn-data'
  41 DUMPFILE = 'cvs2svn-dump'  # The "dumpfile" we create to load into the repos
  42
  43 # Skeleton version of an svn filesystem.
  44 SVN_REVISIONS_DB = 'cvs2svn-revisions.db'
  45 NODES_DB = 'cvs2svn-nodes.db'
  46 SYMBOLIC_NAME_ROOTS_DB = 'cvs2svn-symroots.db'
  47
  48 # See class SymbolicNameTracker for details.
  49 SYMBOLIC_NAMES_DB = "cvs2svn-sym-names.db"
  50
  51 REVS_SUFFIX = '.revs'
  52 CLEAN_REVS_SUFFIX = '.c-revs'
  53 SORTED_REVS_SUFFIX = '.s-revs'
  54 RESYNC_SUFFIX = '.resync'
  55
  56 ATTIC = os.sep + 'Attic'
  57
  58 SVN_INVALID_REVNUM = -1
  59
  60 COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs
  61
  62 OP_NOOP   = '-'
  63 OP_ADD    = 'A'
  64 OP_DELETE = 'D'
  65 OP_CHANGE = 'C'
  66
  67 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
  68
  69 verbose = 1
  70
  71
  72 # Officially, CVS symbolic names must use a fairly restricted set of
  73 # characters.  Unofficially, we don't care if some repositories out
  74 # there don't abide by this, as long as their tags start with a letter
  75 # and don't include '/' or '\' (both of which are prohibited by
  76 # official restrictions anyway).
  77 symbolic_name_re = re.compile('^[a-zA-Z][^/\\\\]*$')
  78
  79 class CollectData(rcsparse.Sink):
  80   def __init__(self, cvsroot, log_fname_base):
  81     self.cvsroot = cvsroot
  82     self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
  83     self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
  84
  85   def set_fname(self, fname):
  86     "Prepare to receive data for a new file."
  87     self.fname = fname
  88
  89     # revision -> [timestamp, author, operation, old-timestamp]
  90     self.rev_data = { }
  91     self.prev = { }
  92     self.branch_names = {}
  93     self.taglist = {}
  94     self.branchlist = {}
  95
  96   def set_branch_name(self, revision, name):
  97     """Record that REVISION is the branch number for BRANCH_NAME.
  98     REVISION is an RCS branch number with an odd number of components,
  99     for example '1.7.2' (never '1.7.0.2')."""
 100     if self.branch_names.has_key(revision):
 101       sys.stderr.write("Error while parsing '%s':\n"
 102                        "   branch %s already has name '%s',\n"
 103                        "   cannot also have name '%s'.\n" \
 104                        % (self.fname, revision,
 105                           self.branch_names[revision], name))
 106       sys.exit(1)
 107     self.branch_names[revision] = name
 108
 109   def get_branch_name(self, revision):
 110     """Return the name of the branch on which REVISION lies.
 111     REVISION is a non-branch evision number with an even number of,
 112     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2')."""
 113     return self.branch_names.get(revision[:revision.rindex(".")])
 114
 115   def add_branch_point(self, revision, branch_name):
 116     """Record that BRANCH_NAME sprouts from REVISION.
 117     REVISION is a non-branch revision number with an even number of
 118     components, for example '1.7' (never '1.7.2' nor '1.7.0.2')."""
 119     if not self.branchlist.has_key(revision):
 120       self.branchlist[revision] = []
 121     self.branchlist[revision].append(branch_name)
 122
 123   def add_cvs_branch(self, revision, branch_name):
 124     """Record the root revision and branch revision for BRANCH_NAME,
 125     based on REVISION.  REVISION is a CVS branch number having an even
 126     number of components where the second-to-last is '0'.  For
 127     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
 128     from 1.7 and has branch number 1.7.2."""
 129     last_dot = revision.rfind(".")
 130     branch_rev = revision[:last_dot]
 131     last2_dot = branch_rev.rfind(".")
 132     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
 133     self.set_branch_name(branch_rev, branch_name)
 134     self.add_branch_point(branch_rev[:last2_dot], branch_name)
 135
 136   def get_tags(self, revision):
 137     """Return a list of all tag names attached to REVISION.
 138     REVISION is a regular revision number like '1.7', and the result
 139     never includes branch names, only plain tags."""
 140     return self.taglist.get(revision, [])
 141
 142   def get_branches(self, revision):
 143     """Return a list of all branch names that sprout from REVISION.
 144     REVISION is a regular revision number like '1.7'."""
 145     return self.branchlist.get(revision, [])
 146
 147   def define_tag(self, name, revision):
 148     """Record a bidirectional mapping between symbolic NAME and REVISION
 149     REVISION is an unprocessed revision number from the RCS file's
 150     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
 151     This function will determine what kind of symbolic name it is by
 152     inspection, and record it in the right places."""
 153     if not symbolic_name_re.match(name):
 154       sys.stderr.write("Error while parsing %s:\n"
 155                        "   '%s' is not a valid tag or branch name.\n" \
 156                        % (self.fname, name))
 157       sys.exit(1)
 158     if branch_tag.match(revision):
 159       self.add_cvs_branch(revision, name)
 160     elif vendor_tag.match(revision):
 161       self.set_branch_name(revision, name)
 162       self.add_branch_point(revision[:revision.rfind(".")], name)
 163     else:
 164       if not self.taglist.has_key(revision):
 165         self.taglist[revision] = []
 166       self.taglist[revision].append(name)
 167
 168   def define_revision(self, revision, timestamp, author, state,
 169                       branches, next):
 170     ### what else?
 171     if state == 'dead':
 172       op = OP_DELETE
 173     else:
 174       op = OP_CHANGE
 175
 176     # store the rev_data as a list in case we have to jigger the timestamp
 177     self.rev_data[revision] = [int(timestamp), author, op, None]
 178
 179     # record the previous revision for sanity checking later
 180     if trunk_rev.match(revision):
 181       self.prev[revision] = next
 182     elif next:
 183       self.prev[next] = revision
 184     for b in branches:
 185       self.prev[b] = revision
 186
 187   def tree_completed(self):
 188     "The revision tree has been parsed. Analyze it for consistency."
 189
 190     # Our algorithm depends upon the timestamps on the revisions occuring
 191     # monotonically over time. That is, we want to see rev 1.34 occur in
 192     # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
 193     # sorting), and then tried to insert 1.34, we'd be screwed.
 194
 195     # to perform the analysis, we'll simply visit all of the 'previous'
 196     # links that we have recorded and validate that the timestamp on the
 197     # previous revision is before the specified revision
 198
 199     # if we have to resync some nodes, then we restart the scan. just keep
 200     # looping as long as we need to restart.
 201     while 1:
 202       for current, prev in self.prev.items():
 203         if not prev:
 204           # no previous revision exists (i.e. the initial revision)
 205           continue
 206         t_c = self.rev_data[current][0]
 207         t_p = self.rev_data[prev][0]
 208         if t_p >= t_c:
 209           # the previous revision occurred later than the current revision.
 210           # shove the previous revision back in time (and any before it that
 211           # may need to shift).
 212           while t_p >= t_c:
 213             self.rev_data[prev][0] = t_c - 1    # new timestamp
 214             self.rev_data[prev][3] = t_p        # old timestamp
 215
 216             print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
 217                   % (relative_name(self.cvsroot, self.fname),
 218                      prev, time.ctime(t_p), time.ctime(t_c - 1))
 219
 220             current = prev
 221             prev = self.prev[current]
 222             if not prev:
 223               break
 224             t_c = t_c - 1               # self.rev_data[current][0]
 225             t_p = self.rev_data[prev][0]
 226
 227           # break from the for-loop
 228           break
 229       else:
 230         # finished the for-loop (no resyncing was performed)
 231         return
 232
 233   def set_revision_info(self, revision, log, text):
 234     timestamp, author, op, old_ts = self.rev_data[revision]
 235     digest = sha.new(log + '\0' + author).hexdigest()
 236     if old_ts:
 237       # the timestamp on this revision was changed. log it for later
 238       # resynchronization of other files's revisions that occurred
 239       # for this time and log message.
 240       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
 241
 242     branch_name = self.get_branch_name(revision)
 243
 244     write_revs_line(self.revs, timestamp, digest, op, revision, self.fname,
 245                     branch_name, self.get_tags(revision),
 246                     self.get_branches(revision))
 247
 248
 249 def make_path(ctx, path, branch_name = None, tag_name = None):
 250   """Return the trunk path, branch path, or tag path for PATH.
 251   CTX holds the name of the branches or tags directory, which is
 252   prepended to PATH when constructing a branch or tag path.
 253
 254   If PATH is empty or None, return the root trunk|branch|tag path.
 255
 256   It is an error to pass both a BRANCH_NAME and a TAG_NAME."""
 257
 258   # For a while, we treated each top-level subdir of the CVS
 259   # repository as a "project root" and interpolated the appropriate
 260   # genealogy (trunk|tag|branch) in according to the official
 261   # recommended layout.  For example, the path '/foo/bar/baz.c' on
 262   # branch 'Rel2' would become
 263   #
 264   #   /foo/branches/Rel2/bar/baz.c
 265   #
 266   # and on trunk it would become
 267   #
 268   #   /foo/trunk/bar/baz.c
 269   #
 270   # However, we went back to the older and simpler method of just
 271   # prepending the genealogy to the front, instead of interpolating.
 272   # So now we produce:
 273   #
 274   #   /branches/Rel2/foo/bar/baz.c
 275   #   /trunk/foo/bar/baz.c
 276   #
 277   # Why?  Well, Jack Repenning pointed out that this way is much
 278   # friendlier to "anonymously rooted subtrees" (that's a tree where
 279   # the name of the top level dir doesn't matter, the point is that if
 280   # you cd into it and, say, run 'make', something good will happen).
 281   # By interpolating, we made it impossible to point cvs2svn at some
 282   # subdir in the CVS repository and convert it as a project, because
 283   # we'd treat every subdir underneath it as an independent project
 284   # root, which is probably not what the user wanted.
 285   #
 286   # Also, see Blair Zajac's post
 287   #
 288   #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 289   #
 290   # and the surrounding thread, for why what people really want is a
 291   # way of specifying an in-repository prefix path, not interpolation.
 292
 293   if branch_name and tag_name:
 294     sys.stderr.write('make_path() miscalled: both branch and tag given.\n')
 295     sys.exit(1)
 296
 297   if branch_name:
 298     if path:
 299       return ctx.branches_base + '/' + branch_name + '/' + path
 300     else:
 301       return ctx.branches_base + '/' + branch_name
 302   elif tag_name:
 303     if path:
 304       return ctx.tags_base + '/' + tag_name + '/' + path
 305     else:
 306       return ctx.tags_base + '/' + tag_name
 307   else:
 308     if path:
 309       return ctx.trunk_base + '/' + path
 310     else:
 311       return ctx.trunk_base
 312
 313
 314 def relative_name(cvsroot, fname):
 315   l = len(cvsroot)
 316   if fname[:l] == cvsroot:
 317     if fname[l] == '/':
 318       return fname[l+1:]
 319     return fname[l:]
 320   sys.stderr.write('relative_path("%s", "%s"): fname is not a sub-path of'
 321                    ' cvsroot\n' % (cvsroot, fname))
 322   sys.exit(1)
 323
 324
 325 def visit_file(arg, dirname, files):
 326   cd, p, stats = arg
 327   for fname in files:
 328     if fname[-2:] != ',v':
 329       continue
 330     pathname = os.path.join(dirname, fname)
 331     if dirname[-6:] == ATTIC:
 332       # drop the 'Attic' portion from the pathname
 333       ### we should record this so we can easily insert it back in
 334       cd.set_fname(os.path.join(dirname[:-6], fname))
 335     else:
 336       cd.set_fname(pathname)
 337     if verbose:
 338       print pathname
 339     try:
 340       p.parse(open(pathname, 'rb'), cd)
 341       stats[0] = stats[0] + 1
 342     except rcsparse.common.RCSExpected:
 343       print "Warning: '%s' is not a valid ,v file, ignoring" % pathname
 344
 345
 346 def is_vendor_first_revision(cvs_rev):
 347   """Return true if CVS_REV is the first revision on a vendor branch,
 348   false otherwise.  If CVS_REV has an even number of components, and
 349   last component is 1 and the component before that is odd, then it is
 350   the first revision on a vendor branch."""
 351   c = string.split(cvs_rev, '.')
 352   n = len(c)
 353   if ((n > 2) and (n % 2 == 0) and (c[-1] == '1') and (int(c[-2]) % 2 == 1)):
 354     return 1
 355   else:
 356     return None
 357
 358
 359 class RevInfoParser(rcsparse.Sink):
 360   def __init__(self):
 361     self.authors = { }  # revision -> author
 362     self.logs = { }     # revision -> log message
 363
 364   def define_revision(self, revision, timestamp, author, state,
 365                       branches, next):
 366     self.authors[revision] = author
 367
 368   def set_revision_info(self, revision, log, text):
 369     self.logs[revision] = log
 370
 371   def parse_cvs_file(self, rcs_pathname):
 372     try:
 373       rcsfile = open(rcs_pathname, 'rb')
 374     except:
 375       try:
 376         dirname, fname = os.path.split(rcs_pathname)
 377         rcs_pathname = os.path.join(dirname, "Attic", fname)
 378         rcsfile = open(rcs_pathname, 'rb')
 379       except:
 380         ### should use a better error
 381         raise RuntimeError, ('error: %s appeared to be under CVS control, '
 382                              'but the RCS file is inaccessible.'
 383                              % rcs_pathname)
 384
 385     rcsparse.Parser().parse(rcsfile, self)
 386
 387
 388 # Return a string that has not been returned by gen_key() before.
 389 gen_key_base = 0L
 390 def gen_key():
 391   global gen_key_base
 392   key = '%x' % gen_key_base
 393   gen_key_base = gen_key_base + 1
 394   return key
 395
 396
 397 class Change:
 398   """Class for recording what actually happened when a change is made,
 399   because not all of the result is guessable by the caller.
 400   See RepositoryMirror.change_path() for more.
 401
 402   The fields are
 403
 404     op:
 405        'A' if path was added, 'C' if changed, or '-' if no action.
 406
 407     closed_tags:
 408        List of tags that this path can no longer be the source of,
 409        that is, tags which could be rooted in the path before the
 410        change, but not after.
 411
 412     closed_branches:
 413        Like closed_tags, but for branches.
 414
 415     deleted_entries:
 416        The list of entries deleted from the destination after
 417        copying a directory, or None.
 418
 419     copyfrom_rev:
 420        The actual revision from which the path was copied, which
 421        may be one less than the requested revision when the path
 422        was deleted in the requested revision, or None."""
 423   def __init__(self, op, closed_tags, closed_branches,
 424                deleted_entries=None, copyfrom_rev=None):
 425     self.op = op
 426     self.closed_tags = closed_tags
 427     self.closed_branches = closed_branches
 428     self.deleted_entries = deleted_entries
 429     self.copyfrom_rev = copyfrom_rev
 430
 431
 432 class RepositoryMirror:
 433   def __init__(self):
 434     # This corresponds to the 'revisions' table in a Subversion fs.
 435     self.revs_db_file = SVN_REVISIONS_DB
 436     self.revs_db = anydbm.open(self.revs_db_file, 'n')
 437
 438     # This corresponds to the 'nodes' table in a Subversion fs.  (We
 439     # don't need a 'representations' or 'strings' table because we
 440     # only track metadata, not file contents.
 441     self.nodes_db_file = NODES_DB
 442     self.nodes_db = anydbm.open(self.nodes_db_file, 'n')
 443
 444     # This tracks which symbolic names the current "head" of a given
 445     # filepath could be the origin node for.  When the next commit on
 446     # that path comes along, we can tell which symbolic names
 447     # originated in the previous version, and signal back to the
 448     # caller that the file can no longer be the origin for those names.
 449     #
 450     # The values are marshalled tuples, (tags, branches), where each
 451     # value is a list.
 452     self.symroots_db_file = SYMBOLIC_NAME_ROOTS_DB
 453     self.symroots_db = anydbm.open(self.symroots_db_file, 'n')
 454
 455     # When copying a directory (say, to create part of a branch), we
 456     # pass change_path() a list of expected entries, so it can remove
 457     # any that are in the source but don't belong on the branch.
 458     # However, because creating a given region of a branch can involve
 459     # copying from several sources, we don't want later copy
 460     # operations to delete entries that were legitimately created by
 461     # earlier copy ops.  So after a copy, the directory records
 462     # legitimate entries under this key, in a dictionary (the keys are
 463     # entry names, the values can be ignored).
 464     self.approved_entries = "/approved-entries"
 465
 466     # Set on a directory that's mutable in the revision currently
 467     # being constructed.  (Yes, this is exactly analogous to
 468     # the Subversion filesystem code's concept of mutability.)
 469     self.mutable_flag = "/mutable"
 470     # This could represent a new mutable directory or file.
 471     self.empty_mutable_thang = { self.mutable_flag : 1 }
 472
 473     # Init a root directory with no entries at revision 0.
 474     self.youngest = 0
 475     self.revs_db[str(self.youngest)] = gen_key()
 476     self.nodes_db[self.revs_db[str(self.youngest)]] = marshal.dumps({})
 477
 478   def new_revision(self):
 479     """Stabilize the current revision, then start the next one.
 480     (Increments youngest.)"""
 481     self.stabilize_youngest()
 482     self.revs_db[str(self.youngest + 1)] \
 483                                       = self.revs_db[str(self.youngest)]
 484     self.youngest = self.youngest + 1
 485
 486   def _stabilize_directory(self, key):
 487     """Close the directory whose node key is KEY."""
 488     dir = marshal.loads(self.nodes_db[key])
 489     if dir.has_key(self.mutable_flag):
 490       del dir[self.mutable_flag]
 491       if dir.has_key(self.approved_entries):
 492         del dir[self.approved_entries]
 493       for entry_key in dir.keys():
 494         if not entry_key[0] == '/':
 495           self._stabilize_directory(dir[entry_key])
 496       self.nodes_db[key] = marshal.dumps(dir)
 497
 498   def stabilize_youngest(self):
 499     """Stabilize the current revision by removing mutable flags."""
 500     root_key = self.revs_db[str(self.youngest)]
 501     self._stabilize_directory(root_key)
 502
 503   def probe_path(self, path, revision=-1, debugging=None):
 504     """If PATH exists in REVISION of the svn repository mirror,
 505     return its leaf value, else return None.
 506     If DEBUGGING is true, then print trace output to stdout.
 507     REVISION defaults to youngest, and PATH must not start with '/'."""
 508     components = string.split(path, '/')
 509     if revision == -1:
 510       revision = self.youngest
 511
 512     if debugging:
 513       print "PROBING path: '%s' in %d" % (path, revision)
 514
 515     parent_key = self.revs_db[str(revision)]
 516     parent = marshal.loads(self.nodes_db[parent_key])
 517     previous_component = "/"
 518
 519     i = 1
 520     for component in components:
 521
 522       if debugging:
 523         print "  " * i,
 524         print "'%s' key: %s, val:" % (previous_component, parent_key), parent
 525
 526       if not parent.has_key(component):
 527         if debugging:
 528           print "  PROBE ABANDONED: '%s' does not contain '%s'" \
 529                 % (previous_component, component)
 530         return None
 531
 532       this_entry_key = parent[component]
 533       this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
 534       parent_key = this_entry_key
 535       parent = this_entry_val
 536       previous_component = component
 537       i = i + 1
 538
 539     if debugging:
 540       print "  " * i,
 541       print "parent_key: %s, val:" % parent_key, parent
 542
 543     # It's not actually a parent at this point, it's the leaf node.
 544     return parent
 545
 546   def change_path(self, path, tags, branches,
 547                   intermediate_dir_func=None,
 548                   copyfrom_path=None, copyfrom_rev=None,
 549                   expected_entries=None, only_if_already_exists=None):
 550     """Record a change to PATH.  PATH may not have a leading slash.
 551     Return a Change instance representing the result of the
 552     change.
 553
 554     TAGS are any tags that sprout from this revision of PATH, BRANCHES
 555     are any branches that sprout from this revision of PATH.
 556
 557     If INTERMEDIATE_DIR_FUNC is not None, then invoke it once on
 558     each full path to each missing intermediate directory in PATH, in
 559     order from shortest to longest.
 560
 561     If COPYFROM_REV and COPYFROM_PATH are not None, then they are a
 562     revision and path to record as the copyfrom sources of this node.
 563     Since this implies an 'A'dd, it would be reasonable to error and
 564     exit if the copyfrom args are present but the node also already
 565     exists.  Reasonable -- but not what we do :-).  The most useful
 566     behavior for callers is instead to report that nothing was done,
 567     by returning '-' for Change.op, so that's what we do.
 568
 569     It is an error for only one copyfrom argument to be present.
 570
 571     If EXPECTED_ENTRIES is not None, then it holds entries expected
 572     to be in the dst after the copy.  Any entries in the new dst but
 573     not in EXPECTED_ENTRIES are removed (ignoring keys beginning with
 574     '/'), and the removed entries returned in Change.deleted_entries,
 575     which are otherwise None.
 576
 577     No action is taken for keys in EXPECTED_ENTRIES but not in the
 578     dst; it is assumed that the caller will compensate for these by
 579     calling change_path again with other arguments.
 580
 581     If ONLY_IF_ALREADY_EXISTS is set, then do a no-op, rather than an add,
 582     if the path does not exist. This is to allow pruning using EXPECTED_ENTRIES
 583     without risking erroneously adding a path."""
 584     if ((copyfrom_rev and not copyfrom_path) or
 585         (copyfrom_path and not copyfrom_rev)):
 586       sys.stderr.write("error: change_path() called with one copyfrom "
 587                        "argument but not the other.\n")
 588       sys.exit(1)
 589
 590     components = string.split(path, '/')
 591     path_so_far = None
 592
 593     parent_key = self.revs_db[str(self.youngest)]
 594     parent = marshal.loads(self.nodes_db[parent_key])
 595     if not parent.has_key(self.mutable_flag):
 596       parent_key = gen_key()
 597       parent[self.mutable_flag] = 1
 598       self.nodes_db[parent_key] = marshal.dumps(parent)
 599       self.revs_db[str(self.youngest)] = parent_key
 600
 601     for component in components[:-1]:
 602       # parent is always mutable at the top of the loop
 603
 604       if path_so_far:
 605         path_so_far = path_so_far + '/' + component
 606       else:
 607         path_so_far = component
 608
 609       # Ensure that the parent has an entry for this component.
 610       if not parent.has_key(component):
 611         if only_if_already_exists:
 612           if expected_entries:
 613             return Change(OP_NOOP, [], [], [])
 614           else:
 615             return Change(OP_NOOP, [], [])
 616         # else
 617         new_child_key = gen_key()
 618         parent[component] = new_child_key
 619         self.nodes_db[new_child_key] = marshal.dumps(self.empty_mutable_thang)
 620         self.nodes_db[parent_key] = marshal.dumps(parent)
 621         if intermediate_dir_func:
 622           intermediate_dir_func(path_so_far)
 623
 624       # One way or another, parent dir now has an entry for component,
 625       # so grab it, see if it's mutable, and DTRT if it's not.  (Note
 626       # it's important to reread the entry value from the db, even
 627       # though we might have just written it -- if we tweak existing
 628       # data structures, we could modify self.empty_mutable_thang,
 629       # which must not happen.)
 630       this_entry_key = parent[component]
 631       this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
 632       if not this_entry_val.has_key(self.mutable_flag):
 633         this_entry_val[self.mutable_flag] = 1
 634         this_entry_key = gen_key()
 635         parent[component] = this_entry_key
 636         self.nodes_db[this_entry_key] = marshal.dumps(this_entry_val)
 637         self.nodes_db[parent_key] = marshal.dumps(parent)
 638
 639       parent_key = this_entry_key
 640       parent = this_entry_val
 641
 642     # Now change the last node, the versioned file.  Just like at the
 643     # top of the above loop, parent is already mutable.
 644     op = OP_ADD
 645     if self.symroots_db.has_key(path):
 646       old_names = marshal.loads(self.symroots_db[path])
 647     else:
 648       old_names = [], []
 649     last_component = components[-1]
 650     new_val = { }
 651     if parent.has_key(last_component):
 652       # The contract for copying over existing nodes is to do nothing
 653       # and return:
 654       if copyfrom_path:
 655         if expected_entries:
 656           return Change(OP_NOOP, old_names[0], old_names[1], [])
 657         else:
 658           return Change(OP_NOOP, old_names[0], old_names[1])
 659       # else
 660       op = OP_CHANGE
 661       new_val = marshal.loads(self.nodes_db[parent[last_component]])
 662     elif only_if_already_exists:
 663       if expected_entries:
 664         return Change(OP_NOOP, [], [], [])
 665       else:
 666         return Change(OP_NOOP, [], [])
 667
 668     leaf_key = gen_key()
 669     deletions = []
 670     actual_copy_rev = copyfrom_rev
 671     if copyfrom_path:
 672       new_val = self.probe_path(copyfrom_path, copyfrom_rev)
 673       if new_val is None:
 674         # Sometimes a branch is rooted in a revision that RCS has
 675         # marked as 'dead'.  Since that path will have been deleted in
 676         # the corresponding Subversion revision, we use the revision
 677         # right before it as the copyfrom rev, and return that to the
 678         # caller so it can emit the right dumpfile instructions.
 679         actual_copy_rev = copyfrom_rev - 1
 680         new_val = self.probe_path(copyfrom_path, actual_copy_rev)
 681     if expected_entries:
 682       approved_entries = new_val.get(self.approved_entries) or { }
 683       new_approved_entries = { }
 684       for ent in new_val.keys():
 685         if (ent[0] != '/'):
 686           if (not expected_entries.has_key(ent)
 687               and not approved_entries.has_key(ent)):
 688             del new_val[ent]
 689             deletions.append(ent)
 690           else:
 691             new_approved_entries[ent] = 1
 692       new_val[self.approved_entries] = new_approved_entries
 693     parent[last_component] = leaf_key
 694     self.nodes_db[parent_key] = marshal.dumps(parent)
 695     self.symroots_db[path] = marshal.dumps((tags, branches))
 696     new_val[self.mutable_flag] = 1
 697     self.nodes_db[leaf_key] = marshal.dumps(new_val)
 698
 699     if expected_entries:
 700       return Change(op, old_names[0], old_names[1], deletions, actual_copy_rev)
 701     else:
 702       return Change(op, old_names[0], old_names[1], None, actual_copy_rev)
 703
 704   def delete_path(self, path, tags, branches, prune=None):
 705     """Delete PATH from the tree.  PATH may not have a leading slash.
 706
 707     Return a tuple (path_deleted, closed_tags, closed_branches), where
 708     path_deleted is the path actually deleted or None if PATH did not
 709     exist, and closed_tags and closed_branches are lists of symbolic
 710     names closed off by this deletion -- that is, tags or branches
 711     which could be rooted in the previous revision of PATH, but not in
 712     this revision, because this rev changes PATH.  If path_deleted is
 713     None, then closed_tags and closed_branches will both be empty.
 714
 715     TAGS are any tags that sprout from this revision of PATH, BRANCHES
 716     are any branches that sprout from this revision of PATH.  (I can't
 717     imagine that there are any of either, what to do if there are?)
 718
 719     If PRUNE is not None, then delete the highest possible directory,
 720     which means the returned path may differ from PATH.  In other
 721     words, if PATH was the last entry in its parent, then delete
 722     PATH's parent, unless it too is the last entry in *its* parent, in
 723     which case delete that parent, and and so on up the chain, until a
 724     directory is encountered that has an entry which is not a member
 725     of the parent stack of the original target.
 726
 727     PRUNE is like the -P option to 'cvs checkout'."""
 728
 729     components = string.split(path, '/')
 730     path_so_far = None
 731
 732     # Start out assuming that we will delete it.  The for-loop may
 733     # change this to None, if it turns out we can't even reach the
 734     # path (i.e., it is already deleted).
 735     retval = path
 736
 737     parent_key = self.revs_db[str(self.youngest)]
 738     parent = marshal.loads(self.nodes_db[parent_key])
 739
 740     # As we walk down to find the dest, we remember each parent
 741     # directory's name and db key, in reverse order: push each new key
 742     # onto the front of the list, so that by the time we reach the
 743     # destination node, the zeroth item in the list is the parent of
 744     # that destination.
 745     #
 746     # Then if we actually do the deletion, we walk the list from left
 747     # to right, replacing as appropriate.
 748     #
 749     # The root directory has name None.
 750     parent_chain = [ ]
 751     parent_chain.insert(0, (None, parent_key))
 752
 753     def is_prunable(dir):
 754       """Return true if DIR, a dictionary representing a directory,
 755       has just zero or one non-special entry, else return false.
 756       (In a pure world, we'd just ask len(DIR) > 1; it's only
 757       because the directory might have mutable flags and other special
 758       entries that we need this function at all.)"""
 759       num_items = len(dir)
 760       if num_items > 3:
 761         return None
 762       if num_items == 3 or num_items == 2:
 763         real_entries = 0
 764         for key in dir.keys():
 765           if not key[0] == '/': real_entries = real_entries + 1
 766         if real_entries > 1:
 767           return None
 768         else:
 769           return 1
 770       else:
 771         return 1
 772
 773     for component in components[:-1]:
 774       # parent is always mutable at the top of the loop
 775
 776       if path_so_far:
 777         path_so_far = path_so_far + '/' + component
 778       else:
 779         path_so_far = component
 780
 781       # If we can't reach the dest, then we don't need to do anything.
 782       if not parent.has_key(component):
 783         return None, [], []
 784
 785       # Otherwise continue downward, dropping breadcrumbs.
 786       this_entry_key = parent[component]
 787       this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
 788       parent_key = this_entry_key
 789       parent = this_entry_val
 790       parent_chain.insert(0, (component, parent_key))
 791
 792     # If the target is not present in its parent, then we're done.
 793     last_component = components[-1]
 794     old_names = [], []
 795     if not parent.has_key(last_component):
 796       return None, [], []
 797     elif self.symroots_db.has_key(path):
 798       old_names = marshal.loads(self.symroots_db[path])
 799       del self.symroots_db[path]
 800
 801     # The target is present, so remove it and bubble up, making a new
 802     # mutable path and/or pruning as necessary.
 803     pruned_count = 0
 804     prev_entry_name = last_component
 805     new_key = None
 806     for parent_item in parent_chain:
 807       pkey = parent_item[1]
 808       pval = marshal.loads(self.nodes_db[pkey])
 809       if prune and (new_key is None) and is_prunable(pval):
 810         pruned_count = pruned_count + 1
 811         pass
 812         # Do nothing more.  All the action takes place when we hit a
 813         # non-prunable parent.
 814       else:
 815         # We hit a non-prunable, or aren't pruning, so bubble up the new gospel.
 816         pval[self.mutable_flag] = 1
 817         if new_key is None:
 818           del pval[prev_entry_name]
 819         else:
 820           pval[prev_entry_name] = new_key
 821         new_key = gen_key()
 822
 823       prev_entry_name = parent_item[0]
 824       if new_key:
 825         self.nodes_db[new_key] = marshal.dumps(pval)
 826
 827     if new_key is None:
 828       new_key = gen_key()
 829       self.nodes_db[new_key] = marshal.dumps(self.empty_mutable_thang)
 830
 831     # Install the new root entry.
 832     self.revs_db[str(self.youngest)] = new_key
 833
 834     if pruned_count > len(components):
 835       sys.stderr.write("Error: deleting '%s' tried to prune %d components.\n"
 836                        % (path, pruned_count))
 837       sys.exit(1)
 838
 839     if pruned_count:
 840       if pruned_count == len(components):
 841         # We never prune away the root directory, so back up one component.
 842         pruned_count = pruned_count - 1
 843       retpath = string.join(components[:0 - pruned_count], '/')
 844     else:
 845       retpath = path
 846
 847     return retpath, old_names[0], old_names[1]
 848
 849     ### We've no place to put tags + branches.  Suspect we just
 850     ### shouldn't be taking them as arguments, which the doc string
 851     ### implies already.  Ponder.
 852
 853   def close(self):
 854     # Just stabilize the last revision.  This may or may not affect
 855     # anything, but if we end up using the mirror for anything after
 856     # this, it's nice to know the '/mutable' entries are gone.
 857     self.stabilize_youngest()
 858
 859
 860 class Dumper:
 861   def __init__(self, dumpfile_path):
 862     'Open DUMPFILE_PATH, and initialize revision to REVISION.'
 863     self.dumpfile_path = dumpfile_path
 864     self.revision = 0
 865     self.dumpfile = open(dumpfile_path, 'wb')
 866     self.repos_mirror = RepositoryMirror()
 867
 868     # Initialize the dumpfile with the standard headers:
 869     #
 870     # The CVS repository doesn't have a UUID, and the Subversion
 871     # repository will be created with one anyway.  So when we load
 872     # the dumpfile, we'll tell svnadmin to ignore the UUID below.
 873     self.dumpfile.write('SVN-fs-dump-format-version: 2\n'
 874                         '\n')
 875
 876   def start_revision(self, props):
 877     """Write the next revision, with properties, to the dumpfile.
 878     Return the newly started revision."""
 879
 880     self.revision = self.revision + 1
 881
 882     # A revision typically looks like this:
 883     #
 884     #   Revision-number: 1
 885     #   Prop-content-length: 129
 886     #   Content-length: 129
 887     #
 888     #   K 7
 889     #   svn:log
 890     #   V 27
 891     #   Log message for revision 1.
 892     #   K 10
 893     #   svn:author
 894     #   V 7
 895     #   jrandom
 896     #   K 8
 897     #   svn:date
 898     #   V 27
 899     #   2003-04-22T22:57:58.132837Z
 900     #   PROPS-END
 901     #
 902     # Notice that the length headers count everything -- not just the
 903     # length of the data but also the lengths of the lengths, including
 904     # the 'K ' or 'V ' prefixes.
 905     #
 906     # The reason there are both Prop-content-length and Content-length
 907     # is that the former includes just props, while the latter includes
 908     # everything.  That's the generic header form for any entity in a
 909     # dumpfile.  But since revisions only have props, the two lengths
 910     # are always the same for revisions.
 911
 912     # Calculate the total length of the props section.
 913     total_len = 10  # len('PROPS-END\n')
 914     for propname in props.keys():
 915       klen = len(propname)
 916       klen_len = len('K %d' % klen)
 917       vlen = len(props[propname])
 918       vlen_len = len('V %d' % vlen)
 919       # + 4 for the four newlines within a given property's section
 920       total_len = total_len + klen + klen_len + vlen + vlen_len + 4
 921
 922     # Print the revision header and props
 923     self.dumpfile.write('Revision-number: %d\n'
 924                         'Prop-content-length: %d\n'
 925                         'Content-length: %d\n'
 926                         '\n'
 927                         % (self.revision, total_len, total_len))
 928
 929     for propname in props.keys():
 930       self.dumpfile.write('K %d\n'
 931                           '%s\n'
 932                           'V %d\n'
 933                           '%s\n' % (len(propname),
 934                                     propname,
 935                                     len(props[propname]),
 936                                     props[propname]))
 937
 938     self.dumpfile.write('PROPS-END\n')
 939     self.dumpfile.write('\n')
 940
 941     self.repos_mirror.new_revision()
 942     return self.revision
 943
 944   def add_dir(self, path):
 945     self.dumpfile.write("Node-path: %s\n"
 946                         "Node-kind: dir\n"
 947                         "Node-action: add\n"
 948                         "Prop-content-length: 10\n"
 949                         "Content-length: 10\n"
 950                         "\n"
 951                         "PROPS-END\n"
 952                         "\n"
 953                         "\n" % path)
 954
 955   def probe_path(self, path):
 956     """Return true if PATH exists in the youngest tree of the svn
 957     repository, else return None.  PATH does not start with '/'."""
 958     if self.repos_mirror.probe_path(path) is None:
 959       return None
 960     else:
 961       return 1
 962
 963   def copy_path(self, svn_src_path, svn_src_rev, svn_dst_path, entries=None):
 964     """Emit a copy of SVN_SRC_PATH at SVN_SRC_REV to SVN_DST_PATH.
 965     If ENTRIES is not None, it is a dictionary whose keys are the full
 966     set of entries the new copy is expected to have -- and therefore
 967     any entries in the new dst but not in ENTRIES will be removed.
 968     (Keys in ENTRIES beginning with '/' are ignored.)
 969
 970     No action is taken for keys in ENTRIES but not in the dst; it is
 971     assumed that the caller will compensate for these by calling
 972     copy_path again with other arguments."""
 973     change = self.repos_mirror.change_path(svn_dst_path,
 974                                            [], [],
 975                                            self.add_dir,
 976                                            svn_src_path, svn_src_rev,
 977                                            entries)
 978     if change.op == 'A':
 979       # We don't need to include "Node-kind:" for copies; the loader
 980       # ignores it anyway and just uses the source kind instead.
 981       self.dumpfile.write('Node-path: %s\n'
 982                           'Node-action: add\n'
 983                           'Node-copyfrom-rev: %d\n'
 984                           'Node-copyfrom-path: /%s\n'
 985                           '\n'
 986                           % (svn_dst_path, change.copyfrom_rev, svn_src_path))
 987
 988       for ent in change.deleted_entries:
 989         self.dumpfile.write('Node-path: %s\n'
 990                             'Node-action: delete\n'
 991                             '\n' % (svn_dst_path + '/' + ent))
 992
 993   def prune_entries(self, path, expected):
 994     """Delete any entries in PATH that are not in list EXPECTED.
 995     PATH need not be a directory, but of course nothing will happen if
 996     it's a file.  Entries beginning with '/' are ignored as usual."""
 997     change = self.repos_mirror.change_path(path,
 998                                            [], [],
 999                                            self.add_dir,
1000                                            None, None,
1001                                            expected, 1)
1002     for ent in change.deleted_entries:
1003       self.dumpfile.write('Node-path: %s\n'
1004                           'Node-action: delete\n'
1005                           '\n' % (path + '/' + ent))
1006
1007   def add_or_change_path(self, cvs_path, svn_path, cvs_rev, rcs_file,
1008                          tags, branches):
1009
1010     # figure out the real file path for "co"
1011     try:
1012       f_st = os.stat(rcs_file)
1013     except os.error:
1014       dirname, fname = os.path.split(rcs_file)
1015       rcs_file = os.path.join(dirname, 'Attic', fname)
1016       f_st = os.stat(rcs_file)
1017
1018     if f_st[0] & stat.S_IXUSR:
1019       is_executable = 1
1020       # "K 14\n" + "svn:executable\n" + "V 1\n" + "*\n" + "PROPS-END\n"
1021       props_len = 36
1022     else:
1023       is_executable = 0
1024       # just "PROPS-END\n"
1025       props_len = 10
1026
1027     ### FIXME: We ought to notice the -kb flag set on the RCS file and
1028     ### use it to set svn:mime-type.
1029
1030     basename = os.path.basename(rcs_file[:-2])
1031     pipe = os.popen('co -q -p%s \'%s\''
1032                     % (cvs_rev, rcs_file.replace("'", "'\\''")), 'r')
1033
1034     # You might think we could just test
1035     #
1036     #   if cvs_rev[-2:] == '.1':
1037     #
1038     # to determine if this path exists in head yet.  But that wouldn't
1039     # be perfectly reliable, both because of 'cvs commit -r', and also
1040     # the possibility of file resurrection.
1041     change = self.repos_mirror.change_path(svn_path, tags, branches,
1042                                            self.add_dir)
1043
1044     if change.op == OP_ADD:
1045       action = 'add'
1046     else:
1047       action = 'change'
1048
1049     self.dumpfile.write('Node-path: %s\n'
1050                         'Node-kind: file\n'
1051                         'Node-action: %s\n'
1052                         'Prop-content-length: %d\n'
1053                         'Text-content-length: '
1054                         % (svn_path, action, props_len))
1055
1056     pos = self.dumpfile.tell()
1057
1058     self.dumpfile.write('0000000000000000\n'
1059                         'Text-content-md5: 00000000000000000000000000000000\n'
1060                         'Content-length: 0000000000000000\n'
1061                         '\n')
1062
1063     if is_executable:
1064       self.dumpfile.write('K 14\n'
1065                           'svn:executable\n'
1066                           'V 1\n'
1067                           '*\n')
1068
1069     self.dumpfile.write('PROPS-END\n')
1070
1071     # Insert the rev contents, calculating length and checksum as we go.
1072     checksum = md5.new()
1073     length = 0
1074     buf = pipe.read()
1075     while buf:
1076       checksum.update(buf)
1077       length = length + len(buf)
1078       self.dumpfile.write(buf)
1079       buf = pipe.read()
1080     pipe.close()
1081
1082     # Go back to patch up the length and checksum headers:
1083     self.dumpfile.seek(pos, 0)
1084     # We left 16 zeros for the text length; replace them with the real
1085     # length, padded on the left with spaces:
1086     self.dumpfile.write('%16d' % length)
1087     # 16... + 1 newline + len('Text-content-md5: ') == 35
1088     self.dumpfile.seek(pos + 35, 0)
1089     self.dumpfile.write(checksum.hexdigest())
1090     # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
1091     self.dumpfile.seek(pos + 84, 0)
1092     # The content length is the length of property data, text data,
1093     # and any metadata around/inside around them.
1094     self.dumpfile.write('%16d' % (length + props_len))
1095     # Jump back to the end of the stream
1096     self.dumpfile.seek(0, 2)
1097
1098     # This record is done.
1099     self.dumpfile.write('\n')
1100     return change.closed_tags, change.closed_branches
1101
1102   def delete_path(self, svn_path, tags, branches, prune=None):
1103     """If SVN_PATH exists in the head mirror, output the deletion to
1104     the dumpfile, else output nothing to the dumpfile.
1105
1106     Return a tuple (path_deleted, closed_tags, closed_branches), where
1107     path_deleted is the path deleted if any or None if no deletion was
1108     necessary, and closed_tags and closed_names are lists of symbolic
1109     names closed off by this deletion -- that is, tags or branches
1110     which could be rooted in the previous revision of PATH, but not in
1111     this revision, because this rev changes PATH.  If path_deleted is
1112     None, then closed_tags and closed_branches will both be empty.
1113
1114     Iff PRUNE is true, then the path deleted can be not None, yet
1115     shorter than SVN_PATH because of pruning."""
1116     deleted_path, closed_tags, closed_branches \
1117                   = self.repos_mirror.delete_path(svn_path, tags,
1118                                                   branches, prune)
1119     if deleted_path:
1120       print '    (deleted %s)' % deleted_path
1121       self.dumpfile.write('Node-path: %s\n'
1122                           'Node-action: delete\n'
1123                           '\n' % deleted_path)
1124     return deleted_path, closed_tags, closed_branches
1125
1126   def close(self):
1127     self.repos_mirror.close()
1128     self.dumpfile.close()
1129
1130
1131 def format_date(date):
1132   """Return an svn-compatible date string for DATE (seconds since epoch)."""
1133   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
1134   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
1135
1136
1137 def make_revision_props(symbolic_name, is_tag):
1138   """Return a dictionary of revision properties for the manufactured
1139   commit that finished SYMBOLIC_NAME.  If IS_TAG is true, write the
1140   log message as though for a tag, else as though for a branch."""
1141   if is_tag:
1142     type = 'tag'
1143   else:
1144     type = 'branch'
1145
1146   # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
1147   if len(symbolic_name) >= 13:
1148     space_or_newline = '\n'
1149   else:
1150     space_or_newline = ' '
1151
1152   log = "This commit was manufactured by cvs2svn to create %s%s'%s'." \
1153         % (type, space_or_newline, symbolic_name)
1154
1155   return { 'svn:author' : 'unknown',
1156            'svn:log' : log,
1157            'svn:date' : format_date(time.time())}
1158
1159
1160 class SymbolicNameTracker:
1161   """Track the Subversion path/revision ranges of CVS symbolic names.
1162   This is done in a .db file, representing a tree in the usual way.
1163   In addition to directory entries, each object in the database stores
1164   the earliest revision from which it could be copied, and the first
1165   revision from which it could no longer be copied.  Intermediate
1166   directories go one step farther: they record counts for the various
1167   revisions from which items under them could have been copied, and
1168   counts for the cutoff revisions.  For example:
1169
1170                                .----------.
1171                                |  sub1    | [(2, 1), (3, 3)]
1172                                |  /       | [(5, 1), (17, 2), (50, 1)]
1173                                | /        |
1174                                |/ sub2    |
1175                                /    \     |
1176                               /|_____\____|
1177                              /        \
1178                       ______/          \_________
1179                      /                           \
1180                     /                             \
1181                    /                               \
1182               .---------.                     .---------.
1183               |  file1  |                     |  file3  |
1184               |   /     | [(3, 2)]            |     \   | [(2, 1), (3, 1)]
1185               |  /      | [(17, 1), (50, 1)]  |      \  | [(5, 1), (10, 1)]
1186               | /       |                     |       \ |
1187               |/ file2  |                     |  file4 \|
1188               /    \    |                     |    /    \
1189              /|_____\___|                     |___/_____|\
1190             /        \                           /        \
1191            /          \                         /          \
1192           /            \                       /            \
1193          /              +                     /              +
1194     +======+            |                 +======+           |
1195     |      | [(3, 1)]   |                 |      | [(2, 1)]  |
1196     |      | [(17, 1)]  |                 |      | [(5, 1)]  |
1197     |      |            |                 |      |           |
1198     +======+            |                 +======+           |
1199                     +======+                             +======+
1200                     |      | [(3, 1)]                    |      | [(3, 1)]
1201                     |      | [(50, 1)]                   |      | [(17, 1)]
1202                     |      |                             |      |
1203                     +======+                             +======+
1204
1205   The two lists to the right of each node represent the 'opening' and
1206   'closing' revisions respectively.  Each tuple in a list is of the
1207   form (REV, COUNT).  For leaf nodes, COUNT is always 1, of course.
1208   For intermediate nodes, the counts are the sums of the corresponding
1209   counts of child nodes.
1210
1211   These revision scores are used to determine the optimal copy
1212   revisions for each tree/subtree at branch or tag creation time.
1213
1214   The svn path input will most often be a trunk path, because the
1215   path/rev information recorded here is about where and when the given
1216   symbolic name could be rooted, *not* a path/rev for which commits
1217   along that symbolic name take place (of course, commits only happen on
1218   branches anyway)."""
1219
1220   def __init__(self):
1221     self.db_file = SYMBOLIC_NAMES_DB
1222     self.db = anydbm.open(self.db_file, 'n')
1223     self.root_key = gen_key()
1224     self.db[self.root_key] = marshal.dumps({})
1225
1226     # The keys for the opening and closing revision lists attached to
1227     # each directory or file.  Includes "/" so as never to conflict
1228     # with any real entry.
1229     self.tags_opening_revs_key = "/tag-openings"
1230     self.tags_closing_revs_key = "/tag-closings"
1231     self.br_opening_revs_key   = "/br-openings"
1232     self.br_closing_revs_key   = "/br-closings"
1233
1234     # When a node is copied into the repository, the revision copied
1235     # is stored under the appropriate key, and the corresponding
1236     # opening and closing rev lists are removed.
1237     self.tags_copyfrom_rev_key = "/tags-copyfrom-rev"
1238     self.br_copyfrom_rev_key = "/br-copyfrom-rev"
1239
1240   def probe_path(self, symbolic_name, path, debugging=None):
1241     """If 'SYMBOLIC_NAME/PATH' exists in the symbolic name tree,
1242     return the value of its last component, else return None.
1243     PATH may be None, but may not start with '/'.
1244     If DEBUGGING is true, then print trace output to stdout."""
1245     if path:
1246       components = [symbolic_name] + string.split(path, '/')
1247     else:
1248       components = [symbolic_name]
1249
1250     if debugging:
1251       print "PROBING SYMBOLIC NAME:\n", components
1252
1253     parent_key = self.root_key
1254     parent = marshal.loads(self.db[parent_key])
1255     last_component = "/"
1256     i = 1
1257     for component in components:
1258       if debugging:
1259         print "  " * i,
1260         print "'%s' key: %s, val:" % (last_component, parent_key), parent
1261
1262       if not parent.has_key(component):
1263         sys.stderr.write("SYM PROBE FAILED: '%s' does not contain '%s'\n" \
1264                          % (last_component, component))
1265         sys.exit(1)
1266
1267       this_entry_key = parent[component]
1268       this_entry_val = marshal.loads(self.db[this_entry_key])
1269       parent_key = this_entry_key
1270       parent = this_entry_val
1271       last_component = component
1272       i = i + 1
1273
1274     if debugging:
1275       print "  " * i,
1276       print "parent_key: %s, val:" % parent_key, parent
1277
1278     # It's not actually a parent at this point, it's the leaf node.
1279     return parent
1280
1281   def bump_rev_count(self, item_key, rev, revlist_key):
1282     """Increment REV's count in opening or closing list under KEY.
1283     REVLIST_KEY is self.*_opening_revs_key or self.*_closing_revs_key,
1284     and indicates which rev list to increment REV's count in.
1285
1286     For example, if REV is 7, REVLIST_KEY is
1287     self.tags_opening_revs_key, and the entry's tags opening revs list
1288     looks like this
1289
1290          [(2, 5), (7, 2), (10, 15)]
1291
1292     then afterwards it would look like this:
1293
1294          [(2, 5), (7, 3), (10, 15)]
1295
1296     But if no tuple for revision 7 were present, then one would be
1297     added, for example
1298
1299          [(2, 5), (10, 15)]
1300
1301     would become
1302
1303          [(2, 5), (7, 1), (10, 15)]
1304
1305     The list is sorted by ascending revision both before and after."""
1306
1307     entry_val = marshal.loads(self.db[item_key])
1308
1309     if not entry_val.has_key(revlist_key):
1310       entry_val[revlist_key] = [(rev, 1)]
1311     else:
1312       rev_counts = entry_val[revlist_key]
1313       for i in range(len(rev_counts)):
1314         this_rev, this_count = rev_counts[i]
1315         if rev == this_rev:
1316           rev_counts[i] = (this_rev, this_count + 1)
1317           break
1318         elif this_rev > rev:
1319           if i > 0:
1320             i = i - 1
1321           rev_counts.insert(i, (rev, 1))
1322           break
1323       else:
1324         rev_counts.append((rev, 1))
1325       entry_val[revlist_key] = rev_counts
1326
1327     self.db[item_key] = marshal.dumps(entry_val)
1328
1329   # The verb form of "root" is "root", but that would be misleading in
1330   # this case; and the opposite of "uproot" is presumably "downroot",
1331   # but that wouldn't exactly clarify either.  Hence, "enroot" :-).
1332   def enroot_names(self, svn_path, svn_rev, names, opening_key):
1333     """Record SVN_PATH at SVN_REV as the earliest point from which the
1334     symbolic names in NAMES could be copied.  OPENING_KEY is
1335     self.tags_opening_revs_key or self.br_opening_revs_key, to
1336     indicate whether NAMES contains tag names or branch names.
1337     SVN_PATH does not start with '/'."""
1338
1339     # Guard against names == None
1340     if not names:
1341       return
1342
1343     for name in names:
1344       components = [name] + string.split(svn_path, '/')
1345       parent_key = self.root_key
1346       for component in components:
1347         self.bump_rev_count(parent_key, svn_rev, opening_key)
1348         parent = marshal.loads(self.db[parent_key])
1349         if not parent.has_key(component):
1350           new_child_key = gen_key()
1351           parent[component] = new_child_key
1352           self.db[new_child_key] = marshal.dumps({})
1353           self.db[parent_key] = marshal.dumps(parent)
1354         # One way or another, parent now has an entry for component.
1355         this_entry_key = parent[component]
1356         this_entry_val = marshal.loads(self.db[this_entry_key])
1357         # Swaparoo.
1358         parent_key = this_entry_key
1359         parent = this_entry_val
1360
1361       self.bump_rev_count(parent_key, svn_rev, opening_key)
1362
1363   def enroot_tags(self, svn_path, svn_rev, tags):
1364     """Record SVN_PATH at SVN_REV as the earliest point from which the
1365     symbolic names in TAGS could be copied.  SVN_PATH does not start
1366     with '/'."""
1367     self.enroot_names(svn_path, svn_rev, tags, self.tags_opening_revs_key)
1368
1369   def enroot_branches(self, svn_path, svn_rev, branches):
1370     """Record SVN_PATH at SVN_REV as the earliest point from which the
1371     symbolic names in BRANCHES could be copied.  SVN_PATH does not
1372     start with '/'."""
1373     self.enroot_names(svn_path, svn_rev, branches, self.br_opening_revs_key)
1374
1375   def close_names(self, svn_path, svn_rev, names, closing_key):
1376     """Record that as of SVN_REV, SVN_PATH could no longer be the
1377     source from which any of symbolic names in NAMES could be copied.
1378     CLOSING_KEY is self.tags_closing_revs_key or
1379     self.br_closing_revs_key, to indicate whether NAMES are tags or
1380     branches.  SVN_PATH does not start with '/'."""
1381
1382     # Guard against names == None
1383     if not names:
1384       return
1385
1386     for name in names:
1387       components = [name] + string.split(svn_path, '/')
1388       parent_key = self.root_key
1389       for component in components:
1390         self.bump_rev_count(parent_key, svn_rev, closing_key)
1391         parent = marshal.loads(self.db[parent_key])
1392         if not parent.has_key(component):
1393           sys.stderr.write("In path '%s', value for parent key '%s' "
1394                            "does not have entry '%s'\n" \
1395                            % (svn_path, parent_key, component))
1396           sys.exit(1)
1397         this_entry_key = parent[component]
1398         this_entry_val = marshal.loads(self.db[this_entry_key])
1399         # Swaparoo.
1400         parent_key = this_entry_key
1401         parent = this_entry_val
1402
1403       self.bump_rev_count(parent_key, svn_rev, closing_key)
1404
1405   def close_tags(self, svn_path, svn_rev, tags):
1406     """Record that as of SVN_REV, SVN_PATH could no longer be the
1407     source from which any of TAGS could be copied.  SVN_PATH does not
1408     start with '/'."""
1409     self.close_names(svn_path, svn_rev, tags, self.tags_closing_revs_key)
1410
1411   def close_branches(self, svn_path, svn_rev, branches):
1412     """Record that as of SVN_REV, SVN_PATH could no longer be the
1413     source from which any of BRANCHES could be copied.  SVN_PATH does
1414     not start with '/'."""
1415     self.close_names(svn_path, svn_rev, branches, self.br_closing_revs_key)
1416
1417   def score_revisions(self, openings, closings):
1418     """Return a list of revisions and scores based on OPENINGS and
1419     CLOSINGS.  The returned list looks like:
1420
1421        [(REV1 SCORE1), (REV2 SCORE2), ...]
1422
1423     where REV2 > REV1 and all scores are > 0.  OPENINGS and CLOSINGS
1424     are the values of self.tags_opening_revs_key and
1425     self.tags_closing_revs_key, or self.br_opening_revs_key and
1426     self.br_closing_revs_key, from some file or directory node, or
1427     else None.
1428
1429     Each score indicates that copying the corresponding revision of
1430     the object in question would yield that many correct paths at or
1431     underneath the object.  There may be other paths underneath it
1432     which are not correct and need to be deleted or recopied; those
1433     can only be detected by descending and examining their scores.
1434
1435     If OPENINGS is false, return the empty list."""
1436
1437     # First look for easy outs.
1438     if not openings:
1439       return []
1440
1441     # Must be able to call len(closings) below.
1442     if closings is None:
1443       closings = []
1444
1445     # No easy out, so wish for lexical closures and calculate the scores :-).
1446     scores = []
1447     opening_score_accum = 0
1448     for i in range(len(openings)):
1449       pair = openings[i]
1450       opening_score_accum = opening_score_accum + pair[1]
1451       scores.append((pair[0], opening_score_accum))
1452     min = 0
1453     for i in range(len(closings)):
1454       closing_rev   = closings[i][0]
1455       closing_score = closings[i][1]
1456       for j in range(min, len(scores)):
1457         opening_pair = scores[j]
1458         if closing_rev <= opening_pair[0]:
1459           scores[j] = (opening_pair[0], opening_pair[1] - closing_score)
1460         else:
1461           min = j + 1
1462     return scores
1463
1464   def best_rev(self, scores):
1465     """Return the revision with the highest score from SCORES, a list
1466     returned by score_revisions()."""
1467     max_score = 0
1468     rev = SVN_INVALID_REVNUM
1469     for pair in scores:
1470       if pair[1] > max_score:
1471         max_score = pair[1]
1472         rev = pair[0]
1473     return rev
1474
1475   # Helper for fill_branch().
1476   def copy_descend(self, dumper, ctx, name, parent, entry_name,
1477                    parent_rev, src_path, dst_path, is_tag, jit_new_rev=None):
1478     """Starting with ENTRY_NAME in directory object PARENT at
1479     PARENT_REV, use DUMPER and CTX to copy nodes in the Subversion
1480     repository, manufacturing the source paths with SRC_PATH and the
1481     destination paths with NAME and DST_PATH.
1482
1483     If IS_TAG is true, NAME is treated as a tag, else as a branch.
1484
1485     If JIT_NEW_REV is not None, it is a list of one element.  If that
1486     element is true, then if any copies are to be made, invoke
1487     DUMPER.start_revision() before the first copy, then set
1488     JIT_NEW_REV[0] to None, so no more new revisions are made for this
1489     symbolic name anywhere in this descent.
1490
1491     ('JIT' == 'Just In Time'.)"""
1492     ### Hmmm, is passing [1] instead of 1 an idiomatic way of passing
1493     ### a side-effectable boolean in Python?  That's how the
1494     ### JIT_NEW_REV parameter works here and elsewhere, but maybe
1495     ### there's a clearer way to do it?
1496
1497     key = parent[entry_name]
1498     val = marshal.loads(self.db[key])
1499
1500     if is_tag:
1501       opening_key = self.tags_opening_revs_key
1502       closing_key = self.tags_closing_revs_key
1503       copyfrom_rev_key = self.tags_copyfrom_rev_key
1504     else:
1505       opening_key = self.br_opening_revs_key
1506       closing_key = self.br_closing_revs_key
1507       copyfrom_rev_key = self.br_copyfrom_rev_key
1508
1509     if not val.has_key(copyfrom_rev_key):
1510       # If not already copied this subdir, calculate its "best rev"
1511       # and see if it differs from parent's best rev.
1512       scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1513       rev = self.best_rev(scores)
1514
1515       if rev == SVN_INVALID_REVNUM:
1516         return  # name is a branch, but we're doing a tag, or vice versa
1517
1518       else:
1519         if is_tag:
1520           copy_dst = make_path(ctx, dst_path, None, name)
1521         else:
1522           copy_dst = make_path(ctx, dst_path, name, None)
1523
1524         if (rev != parent_rev):
1525           parent_rev = rev
1526           if jit_new_rev and jit_new_rev[0]:
1527             dumper.start_revision(make_revision_props(name, is_tag))
1528             jit_new_rev[0] = None
1529           dumper.copy_path(src_path, parent_rev, copy_dst, val)
1530           # Record that this copy is done:
1531           val[copyfrom_rev_key] = parent_rev
1532           if val.has_key(opening_key):
1533             del val[opening_key]
1534           if val.has_key(closing_key):
1535             del val[closing_key]
1536           self.db[key] = marshal.dumps(val)
1537         else:
1538           # Even if we kept the already-present revision of this entry
1539           # instead of copying a new one, we still need to prune out
1540           # anything that's not part of the symbolic name.
1541           dumper.prune_entries(copy_dst, val)
1542
1543     for ent in val.keys():
1544       if not ent[0] == '/':
1545         if src_path:
1546           next_src = src_path + '/' + ent
1547         else:
1548           next_src = ent
1549         if dst_path:
1550           next_dst = dst_path + '/' + ent
1551         else:
1552           next_dst = ent
1553         self.copy_descend(dumper, ctx, name, val, ent, parent_rev,
1554                           next_src, next_dst, is_tag, jit_new_rev)
1555
1556   def fill_name(self, dumper, ctx, name, is_tag, jit_new_rev=None):
1557     """Use DUMPER to create all currently available parts of symbolic
1558     name NAME that have not been created already.
1559
1560     If IS_TAG is true, NAME is treated as a tag, else as a branch.
1561
1562     If JIT_NEW_REV is not None, it is a list of one element.  If that
1563     element is true, then if any copies are to be made, invoke
1564     DUMPER.start_revision() before the first copy.
1565
1566     ('JIT' == 'Just In Time'.)"""
1567
1568     # A source path looks like this in the symbolic name tree:
1569     #
1570     #    thisbranch/trunk/proj/foo/bar/baz.c
1571     #
1572     # ...or occasionally...
1573     #
1574     #    thisbranch/branches/sourcebranch/proj/foo/bar/baz.c
1575     #
1576     # (the latter when 'thisbranch' is branched off 'sourcebranch').
1577     #
1578     # Meanwhile, we're copying to a location in the repository like
1579     #
1580     #    /branches/thisbranch/proj/foo/bar/baz.c    or
1581     #    /tags/tagname/proj/foo/bar/baz.c
1582     #
1583     # Of course all this depends on make_path()'s behavior.  At
1584     # various times we've changed the way it produces paths (see
1585     # revisions 6028 and 6347).  If it changes again, the logic here
1586     # must be adjusted to match.
1587
1588     parent_key = self.root_key
1589     parent = marshal.loads(self.db[parent_key])
1590
1591     if not parent.has_key(name):
1592       if is_tag:
1593         sys.stderr.write("No origin records for tag '%s'.\n" % name)
1594       else:
1595         sys.stderr.write("No origin records for branch '%s'.\n" % name)
1596       sys.exit(1)
1597
1598     parent_key = parent[name]
1599     parent = marshal.loads(self.db[parent_key])
1600
1601     # All Subversion source paths under the branch start with one of
1602     # three things:
1603     #
1604     #   /trunk/...
1605     #   /branches/foo/...
1606     #   /tags/foo/...
1607     #
1608     # (We don't care what foo is, it's just a component to skip over.)
1609     #
1610     # Since these don't all have the same number of components, we
1611     # manually descend into each as far as necessary, then invoke
1612     # copy_descend() once we're in the right place in both trees.
1613     #
1614     # Since it's possible for a branch or tag to have some source
1615     # paths on trunk and some on branches, there's some question about
1616     # what to copy as the top-level directory of the branch.  Our
1617     # solution is to [somewhat randomly] give preference to trunk.
1618     # Note that none of these paths can ever conflict; for example,
1619     # it would be impossible to have both
1620     #
1621     #   thisbranch/trunk/myproj/lib/drivers.c                   and
1622     #   thisbranch/branches/sourcebranch/myproj/lib/drivers.c
1623     #
1624     # because that would imply that the symbolic name 'thisbranch'
1625     # appeared twice in the RCS file header, referring to two
1626     # different revisions.  Well, I suppose that's *possible*, but its
1627     # effect is undefined, and it's as reasonable for us to just
1628     # overwrite one with the other as anything else -- anyway, isn't
1629     # that what CVS would do if you checked out the branch?  <shrug>
1630
1631     if parent.has_key(ctx.trunk_base):
1632       self.copy_descend(dumper, ctx, name, parent, ctx.trunk_base,
1633                         SVN_INVALID_REVNUM, ctx.trunk_base, "",
1634                         is_tag, jit_new_rev)
1635     if parent.has_key(ctx.branches_base):
1636       branch_base_key = parent[ctx.branches_base]
1637       branch_base = marshal.loads(self.db[branch_base_key])
1638       for this_source in branch_base.keys():
1639         # We skip special names beginning with '/' for the usual
1640         # reason.  We skip cases where (this_source == name) for a
1641         # different reason: if a CVS branch were rooted in itself,
1642         # that would imply that the same symbolic name appeared on two
1643         # different branches in an RCS file, which CVS doesn't
1644         # permit.  So while it wouldn't hurt to descend, it would be a
1645         # waste of time.
1646         if (this_source[0] != '/') and (this_source != name):
1647           src_path = ctx.branches_base + '/' + this_source
1648           self.copy_descend(dumper, ctx, name, branch_base, this_source,
1649                             SVN_INVALID_REVNUM, src_path, "",
1650                             is_tag, jit_new_rev)
1651
1652   def fill_tag(self, dumper, ctx, tag, jit_new_rev=None):
1653     """Use DUMPER to create all currently available parts of TAG that
1654     have not been created already.  Use CTX.trunk_base, CTX.tags_base,
1655     and CTX.branches_base to determine the source and destination
1656     paths in the Subversion repository.
1657
1658     If JIT_NEW_REV is not None, it is a list of one element.  If that
1659     element is true, then if any copies are to be made, invoke
1660     DUMPER.start_revision() before the first copy.
1661
1662     ('JIT' == 'Just In Time'.)"""
1663     self.fill_name(dumper, ctx, tag, 1, jit_new_rev)
1664
1665   def fill_branch(self, dumper, ctx, branch, jit_new_rev=None):
1666     """Use DUMPER to create all currently available parts of BRANCH that
1667     haven't been created already.  Use CTX.trunk_base, CTX.tags_base,
1668     and CTX.branches_base to determine the source and destination
1669     paths in the Subversion repository.
1670
1671     If JIT_NEW_REV is not None, it is a list of one element.  If that
1672     element is true, then if any copies are to be made, invoke
1673     DUMPER.start_revision() before the first copy.
1674
1675     ('JIT' == 'Just In Time'.)"""
1676     self.fill_name(dumper, ctx, branch, None, jit_new_rev)
1677
1678   def finish(self, dumper, ctx):
1679     """Use DUMPER to finish branches and tags that have either
1680     not been created yet, or have been only partially created.
1681     Use CTX.trunk_base, CTX.tags_base, and CTX.branches_base to
1682     determine the source and destination paths in the Subversion
1683     repository."""
1684     parent_key = self.root_key
1685     parent = marshal.loads(self.db[parent_key])
1686     # Do all branches first, then all tags.  We don't bother to check
1687     # here whether a given name is a branch or a tag, or is done
1688     # already; the fill_foo() methods will just do nothing if there's
1689     # nothing to do.
1690     #
1691     # We do one revision per branch or tag, for clarity to users, not
1692     # for correctness.  In CVS, when you make a branch off a branch,
1693     # the new branch will just root itself in the roots of the old
1694     # branch *except* where the new branch sprouts from a revision
1695     # that was actually committed on the old branch.  In the former
1696     # cases, the source paths will be the same as the source paths
1697     # from which the old branch was created and therefore will already
1698     # exist; and in the latter case, the source paths will actually be
1699     # on the old branch, but those paths will exist already because
1700     # they were commits on that branch and therefore cvs2svn must have
1701     # created it already (see the fill_branch call in Commit.commit).
1702     # So either way, the source paths exist by the time we need them.
1703     #
1704     ### It wouldn't be so awfully hard to determine whether a name is
1705     ### just a branch or just a tag, which would allow for more
1706     ### intuitive messages below.
1707     if not ctx.trunk_only:
1708       print "Finishing branches:"
1709       for name in parent.keys():
1710         if name[0] != '/':
1711           print "finishing '%s' as branch" % name
1712           self.fill_branch(dumper, ctx, name, [1])
1713     print "Finishing tags:"
1714     for name in parent.keys():
1715       if name[0] != '/':
1716         print "finishing '%s' as tag" % name
1717         self.fill_tag(dumper, ctx, name, [1])
1718
1719
1720 class Commit:
1721   def __init__(self):
1722     self.files = { }
1723     self.changes = [ ]
1724     self.deletes = [ ]
1725     self.t_min = 1<<30
1726     self.t_max = 0
1727
1728   def has_file(self, fname):
1729     return self.files.has_key(fname)
1730
1731   def add(self, t, op, file, rev, branch_name, tags, branches):
1732     # Record the time range of this commit.
1733     #
1734     # ### ISSUE: It's possible, though unlikely, that the time range
1735     # of a commit could get gradually expanded to be arbitrarily
1736     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
1737     # problem, and anyway deciding where to break it up would be a
1738     # judgement call. For now, we just print a warning in commit() if
1739     # this happens.
1740     if t < self.t_min:
1741       self.t_min = t
1742     if t > self.t_max:
1743       self.t_max = t
1744
1745     if op == OP_CHANGE:
1746       self.changes.append((file, rev, branch_name, tags, branches))
1747     else:
1748       # OP_DELETE
1749       self.deletes.append((file, rev, branch_name, tags, branches))
1750     self.files[file] = 1
1751
1752   def get_metadata(self):
1753     # by definition, the author and log message must be the same for all
1754     # items that went into this commit. therefore, just grab any item from
1755     # our record of changes/deletes.
1756     if self.changes:
1757       file, rev, br, tags, branches = self.changes[0]
1758     else:
1759       # there better be one...
1760       file, rev, br, tags, branches = self.deletes[0]
1761
1762     # now, fetch the author/log from the ,v file
1763     rip = RevInfoParser()
1764     rip.parse_cvs_file(file)
1765     author = rip.authors[rev]
1766     log = rip.logs[rev]
1767     # and we already have the date, so just format it
1768     date = format_date(self.t_max)
1769
1770     return author, log, date
1771
1772   def commit(self, dumper, ctx, sym_tracker):
1773     # commit this transaction
1774     seconds = self.t_max - self.t_min
1775     print 'committing: %s, over %d seconds' % (time.ctime(self.t_min), seconds)
1776     if seconds > COMMIT_THRESHOLD:
1777       print 'WARNING: commit spans more than %d seconds' % COMMIT_THRESHOLD
1778
1779     if ctx.dry_run:
1780       for f, r, br, tags, branches in self.changes:
1781         # compute a repository path, dropping the ,v from the file name
1782         svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
1783         print '    adding or changing %s : %s' % (r, svn_path)
1784       for f, r, br, tags, branches in self.deletes:
1785         # compute a repository path, dropping the ,v from the file name
1786         svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
1787         print '    deleting %s : %s' % (r, svn_path)
1788       print '    (skipped; dry run enabled)'
1789       return
1790
1791     do_copies = [ ]
1792
1793     # get the metadata for this commit
1794     author, log, date = self.get_metadata()
1795     try:
1796       ### FIXME: The 'replace' behavior should be an option, like
1797       ### --encoding is.
1798       unicode_author = unicode(author, ctx.encoding, 'replace')
1799       unicode_log = unicode(log, ctx.encoding, 'replace')
1800       props = { 'svn:author' : unicode_author.encode('utf8'),
1801                 'svn:log' : unicode_log.encode('utf8'),
1802                 'svn:date' : date }
1803     except UnicodeError:
1804       print 'Problem encoding author or log message:'
1805       print "  author: '%s'" % author
1806       print "  log:    '%s'" % log
1807       print "  date:   '%s'" % date
1808       for rcs_file, cvs_rev, br, tags, branches in self.changes:
1809         print "    rev %s of '%s'" % (cvs_rev, rcs_file)
1810       print 'Try rerunning with (for example) \"--encoding=latin1\".'
1811       sys.exit(1)
1812
1813     # Tells whether we actually wrote anything to the dumpfile.
1814     svn_rev = SVN_INVALID_REVNUM
1815
1816     for rcs_file, cvs_rev, br, tags, branches in self.changes:
1817       # compute a repository path, dropping the ,v from the file name
1818       cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
1819       svn_path = make_path(ctx, cvs_path, br)
1820       if svn_rev == SVN_INVALID_REVNUM:
1821         svn_rev = dumper.start_revision(props)
1822       sym_tracker.enroot_tags(svn_path, svn_rev, tags)
1823       sym_tracker.enroot_branches(svn_path, svn_rev, branches)
1824       if br:
1825         ### FIXME: Here is an obvious optimization point.  Probably
1826         ### dump.probe_path(PATH) is kind of slow, because it does N
1827         ### database lookups for the N components in PATH.  If this
1828         ### turns out to be a performance bottleneck, we can just
1829         ### maintain a database mirroring just the head tree, but
1830         ### keyed on full paths, to reduce the check to a quick
1831         ### constant time query.
1832         if not dumper.probe_path(svn_path):
1833           sym_tracker.fill_branch(dumper, ctx, br)
1834       # The first revision on a vendor branch is always the same as
1835       # the revision from which the branch sprouts, e.g., 1.1.1.1 is
1836       # always the same as 1.1, so there's no need to further modify
1837       # 1.1.1.1 from however it is in the copy from 1.1.
1838       if not (br and is_vendor_first_revision(cvs_rev)):
1839         print '    adding or changing %s : %s' % (cvs_rev, svn_path)
1840         closed_tags, closed_branches = dumper.add_or_change_path(cvs_path,
1841                                                                  svn_path,
1842                                                                  cvs_rev,
1843                                                                  rcs_file,
1844                                                                  tags,
1845                                                                  branches)
1846         sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
1847         sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
1848
1849     for rcs_file, cvs_rev, br, tags, branches in self.deletes:
1850       # compute a repository path, dropping the ,v from the file name
1851       cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
1852       svn_path = make_path(ctx, cvs_path, br)
1853       print '    deleting %s : %s' % (cvs_rev, svn_path)
1854       if cvs_rev != '1.1':
1855         if svn_rev == SVN_INVALID_REVNUM:
1856           svn_rev = dumper.start_revision(props)
1857         # Uh, can this even happen on a deleted path?  Hmmm.  If not,
1858         # there's no risk, since tags and branches would just be empty
1859         # and therefore enrooting would be a no-op.  Still, it would
1860         # be clearer to know for sure and simply not call it.
1861         sym_tracker.enroot_tags(svn_path, svn_rev, tags)
1862         sym_tracker.enroot_branches(svn_path, svn_rev, branches)
1863         ### FIXME: this will return path_deleted == None if no path
1864         ### was deleted.  But we'll already have started the revision
1865         ### by then, so it's a bit late to use the knowledge!  Need to
1866         ### reorganize things so that starting the revision is a
1867         ### callback with its own internal conditional, so anyone can
1868         ### just invoke when they know they're really about to do
1869         ### something.
1870         ###
1871         ### Right now what happens is we get an empty revision
1872         ### (assuming nothing else happened in this revision).
1873         path_deleted, closed_tags, closed_branches = \
1874                       dumper.delete_path(svn_path, tags, branches, ctx.prune)
1875         sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
1876         sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
1877
1878     if svn_rev != SVN_INVALID_REVNUM:
1879       print '    new revision:', svn_rev
1880     else:
1881       print '    no new revision created, as nothing to do'
1882
1883
1884 def read_resync(fname):
1885   "Read the .resync file into memory."
1886
1887   ### note that we assume that we can hold the entire resync file in
1888   ### memory. really large repositories with whacky timestamps could
1889   ### bust this assumption. should that ever happen, then it is possible
1890   ### to split the resync file into pieces and make multiple passes,
1891   ### using each piece.
1892
1893   #
1894   # A digest maps to a sequence of lists which specify a lower and upper
1895   # time bound for matching up the commit. We keep a sequence of these
1896   # because a number of checkins with the same log message (e.g. an empty
1897   # log message) could need to be remapped. We also make them a list because
1898   # we will dynamically expand the lower/upper bound as we find commits
1899   # that fall into a particular msg and time range.
1900   #
1901   # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
1902   #
1903   resync = { }
1904
1905   for line in fileinput.FileInput(fname):
1906     t1 = int(line[:8], 16)
1907     digest = line[9:DIGEST_END_IDX]
1908     t2 = int(line[DIGEST_END_IDX+1:], 16)
1909     t1_l = t1 - COMMIT_THRESHOLD/2
1910     t1_u = t1 + COMMIT_THRESHOLD/2
1911     if resync.has_key(digest):
1912       resync[digest].append([t1_l, t1_u, t2])
1913     else:
1914       resync[digest] = [ [t1_l, t1_u, t2] ]
1915
1916   # For each digest, sort the resync items in it in increasing order,
1917   # based on the lower time bound.
1918   digests = resync.keys()
1919   for digest in digests:
1920     (resync[digest]).sort()
1921
1922   return resync
1923
1924
1925 def parse_revs_line(line):
1926   data = line.split(' ', 6)
1927   timestamp = int(data[0], 16)
1928   id = data[1]
1929   op = data[2]
1930   rev = data[3]
1931   branch_name = data[4]
1932   if branch_name == "*":
1933     branch_name = None
1934   ntags = int(data[5])
1935   tags = data[6].split(' ', ntags + 1)
1936   nbranches = int(tags[ntags])
1937   branches = tags[ntags + 1].split(' ', nbranches)
1938   fname = branches[nbranches][:-1]  # strip \n
1939   tags = tags[:ntags]
1940   branches = branches[:nbranches]
1941
1942   return timestamp, id, op, rev, fname, branch_name, tags, branches
1943
1944
1945 def write_revs_line(output, timestamp, digest, op, revision, fname,
1946                     branch_name, tags, branches):
1947   output.write('%08lx %s %s %s ' % (timestamp, digest, op, revision))
1948   if not branch_name:
1949     branch_name = "*"
1950   output.write('%s ' % branch_name)
1951   output.write('%d ' % (len(tags)))
1952   for tag in tags:
1953     output.write('%s ' % (tag))
1954   output.write('%d ' % (len(branches)))
1955   for branch in branches:
1956     output.write('%s ' % (branch))
1957   output.write('%s\n' % fname)
1958
1959
1960 def pass1(ctx):
1961   cd = CollectData(ctx.cvsroot, DATAFILE)
1962   p = rcsparse.Parser()
1963   stats = [ 0 ]
1964   os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
1965   if ctx.verbose:
1966     print 'processed', stats[0], 'files'
1967
1968
1969 def pass2(ctx):
1970   "Pass 2: clean up the revision information."
1971
1972   # We may have recorded some changes in revisions' timestamp. We need to
1973   # scan for any other files which may have had the same log message and
1974   # occurred at "the same time" and change their timestamps, too.
1975
1976   # read the resync data file
1977   resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)
1978
1979   output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')
1980
1981   # process the revisions file, looking for items to clean up
1982   for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
1983     timestamp, digest, op, rev, fname, branch_name, tags, branches = \
1984       parse_revs_line(line)
1985     if not resync.has_key(digest):
1986       output.write(line)
1987       continue
1988
1989     # we have a hit. see if this is "near" any of the resync records we
1990     # have recorded for this digest [of the log message].
1991     for record in resync[digest]:
1992       if record[0] <= timestamp <= record[1]:
1993         # bingo! remap the time on this (record[2] is the new time).
1994         write_revs_line(output, record[2], digest, op, rev, fname,
1995                         branch_name, tags, branches)
1996
1997         print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
1998               % (relative_name(ctx.cvsroot, fname),
1999                  rev, time.ctime(timestamp), time.ctime(record[2]))
2000
2001         # adjust the time range. we want the COMMIT_THRESHOLD from the
2002         # bounds of the earlier/latest commit in this group.
2003         record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
2004         record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)
2005
2006         # stop looking for hits
2007         break
2008     else:
2009       # the file/rev did not need to have its time changed.
2010       output.write(line)
2011
2012
2013 def pass3(ctx):
2014   # sort the log files
2015   os.system('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
2016                               ctx.log_fname_base + SORTED_REVS_SUFFIX))
2017
2018
2019 def pass4(ctx):
2020   sym_tracker = SymbolicNameTracker()
2021
2022   # A dictionary of Commit objects, keyed by digest.  Each object
2023   # represents one logical commit, which may involve multiple files.
2024   #
2025   # The reason this is a dictionary, not a single object, is that
2026   # there may be multiple commits interleaved in time.  A commit can
2027   # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
2028   # for parts of some other commit to occur.  Since the s-revs file is
2029   # sorted by timestamp first, then by digest within each timestamp,
2030   # it's quite easy to have interleaved commits.
2031   commits = { }
2032
2033   # The total number of separate commits processed.  This is used only for
2034   # printing statistics, it does not affect the results in the repository.
2035   count = 0
2036
2037   # Start the dumpfile object.
2038   dumper = Dumper(ctx.dumpfile)
2039
2040   # process the logfiles, creating the target
2041   for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
2042     timestamp, id, op, rev, fname, branch_name, tags, branches = \
2043       parse_revs_line(line)
2044
2045     if ctx.trunk_only and not trunk_rev.match(rev):
2046       ### note this could/should have caused a flush, but the next item
2047       ### will take care of that for us
2048       continue
2049
2050     # Each time we read a new line, we scan the commits we've
2051     # accumulated so far to see if any are ready for processing now.
2052     process = [ ]
2053     for scan_id, scan_c in commits.items():
2054
2055       # ### ISSUE: the has_file() check below is not optimal.
2056       # It does fix the dataloss bug where revisions would get lost
2057       # if checked in too quickly, but it can also break apart the
2058       # commits. The correct fix would require tracking the dependencies
2059       # between change sets and committing them in proper order.
2060       if scan_c.t_max + COMMIT_THRESHOLD < timestamp or \
2061          scan_c.has_file(fname):
2062         process.append((scan_c.t_max, scan_c))
2063         del commits[scan_id]
2064
2065     # If there are any elements in 'process' at this point, they need
2066     # to be committed, because this latest rev couldn't possibly be
2067     # part of any of them.  Sort them into time-order, then commit 'em.
2068     process.sort()
2069     for t_max, c in process:
2070       c.commit(dumper, ctx, sym_tracker)
2071     count = count + len(process)
2072
2073     # Add this item into the set of still-available commits.
2074     if commits.has_key(id):
2075       c = commits[id]
2076     else:
2077       c = commits[id] = Commit()
2078     c.add(timestamp, op, fname, rev, branch_name, tags, branches)
2079
2080   # End of the sorted revs file.  Flush any remaining commits:
2081   if commits:
2082     process = [ ]
2083     for id, c in commits.items():
2084       process.append((c.t_max, c))
2085     process.sort()
2086     for t_max, c in process:
2087       c.commit(dumper, ctx, sym_tracker)
2088     count = count + len(process)
2089
2090   # Create (or complete) any branches and tags not already done.
2091   sym_tracker.finish(dumper, ctx)
2092
2093   dumper.close()
2094
2095   if ctx.verbose:
2096     print count, 'commits processed.'
2097
2098
2099 def pass5(ctx):
2100   # on a dry or dump-only run, there is nothing really to do in pass 5
2101   if ctx.dry_run or ctx.dump_only:
2102     return
2103
2104   # create the target repository is so requested
2105   if ctx.create_repos:
2106     os.system('%s create %s' % (ctx.svnadmin, ctx.target))
2107
2108   # now, load the dumpfile into the repository
2109   print 'loading %s into %s' % (ctx.dumpfile, ctx.target)
2110   os.system('%s load %s < %s'
2111             % (ctx.svnadmin, ctx.target, ctx.dumpfile))
2112
2113
2114 _passes = [
2115   pass1,
2116   pass2,
2117   pass3,
2118   pass4,
2119   pass5,
2120   ]
2121
2122
2123 class _ctx:
2124   pass
2125
2126
2127 def convert(ctx, start_pass=1):
2128   "Convert a CVS repository to an SVN repository."
2129
2130   times = [ None ] * len(_passes)
2131   for i in range(start_pass - 1, len(_passes)):
2132     times[i] = time.time()
2133     if verbose:
2134       print '----- pass %d -----' % (i + 1)
2135     _passes[i](ctx)
2136   times.append(time.time())
2137
2138   if verbose:
2139     for i in range(start_pass, len(_passes)+1):
2140       print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
2141     print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
2142
2143
2144 def usage(ctx):
2145   print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
2146         % os.path.basename(sys.argv[0])
2147   print '  -n               dry run; parse CVS repos, but do not construct SVN repos'
2148   print '  -v               verbose'
2149   print '  -s PATH          path for SVN repos'
2150   print '  -p NUM           start at pass NUM of %d' % len(_passes)
2151   print '  --create         create a new SVN repository'
2152   print '  --dumpfile=PATH  name of intermediate svn dumpfile'
2153   print '  --svnadmin=PATH  path to the svnadmin program'
2154   print '  --trunk-only     convert only trunk commits, not tags nor branches'
2155   print '  --trunk=PATH     path for trunk (default: %s)'    \
2156         % ctx.trunk_base
2157   print '  --branches=PATH  path for branches (default: %s)' \
2158         % ctx.branches_base
2159   print '  --tags=PATH      path for tags (default: %s)'     \
2160         % ctx.tags_base
2161   print '  --no-prune       don\'t prune empty directories'
2162   print '  --dump-only      just produce a dumpfile, don\'t commit to a repos'
2163   print '  --encoding=ENC   encoding of log messages in CVS repos (default: %s)' % ctx.encoding
2164   sys.exit(1)
2165
2166
2167 def main():
2168   # prepare the operation context
2169   ctx = _ctx()
2170   ctx.cvsroot = None
2171   ctx.target = None
2172   ctx.log_fname_base = DATAFILE
2173   ctx.dumpfile = DUMPFILE
2174   ctx.verbose = 0
2175   ctx.dry_run = 0
2176   ctx.prune = 1
2177   ctx.create_repos = 0
2178   ctx.dump_only = 0
2179   ctx.trunk_only = 0
2180   ctx.trunk_base = "trunk"
2181   ctx.tags_base = "tags"
2182   ctx.branches_base = "branches"
2183   ctx.encoding = "ascii"
2184   ctx.svnadmin = "svnadmin"
2185
2186   try:
2187     opts, args = getopt.getopt(sys.argv[1:], 'p:s:vn',
2188                                [ "create", "trunk=",
2189                                  "branches=", "tags=", "encoding=",
2190                                  "trunk-only", "no-prune", "dump-only"])
2191   except getopt.GetoptError:
2192     usage(ctx)
2193   if len(args) != 1:
2194     usage(ctx)
2195
2196   ctx.cvsroot = args[0]
2197   start_pass = 1
2198
2199   for opt, value in opts:
2200     if opt == '-p':
2201       start_pass = int(value)
2202       if start_pass < 1 or start_pass > len(_passes):
2203         print 'ERROR: illegal value (%d) for starting pass. ' \
2204               'must be 1 through %d.' % (start_pass, len(_passes))
2205         sys.exit(1)
2206     elif opt == '-v':
2207       ctx.verbose = 1
2208     elif opt == '-n':
2209       ctx.dry_run = 1
2210     elif opt == '-s':
2211       ctx.target = value
2212     elif opt == '--create':
2213       ctx.create_repos = 1
2214     elif opt == '--dumpfile':
2215       ctx.dumpfile = value
2216     elif opt == '--svnadmin':
2217       ctx.svnadmin = value
2218     elif opt == '--trunk-only':
2219       ctx.trunk_only = 1
2220     elif opt == '--trunk':
2221       ctx.trunk_base = value
2222     elif opt == '--branches':
2223       ctx.branches_base = value
2224     elif opt == '--tags':
2225       ctx.tags_base = value
2226     elif opt == '--no-prune':
2227       ctx.prune = None
2228     elif opt == '--dump-only':
2229       ctx.dump_only = 1
2230     elif opt == '--encoding':
2231       ctx.encoding = value
2232
2233   # Consistency check for options.
2234   if (not ctx.target) and (not ctx.dump_only):
2235     sys.stderr.write("Error: must pass one of '-s' or '--dump-only'.\n")
2236     sys.exit(1)
2237
2238   if ctx.target and ctx.dump_only:
2239     sys.stderr.write("Error: cannot pass both '-s' and '--dump-only'.\n")
2240     sys.exit(1)
2241
2242   if ctx.create_repos and ctx.dump_only:
2243     sys.stderr.write("Error: cannot pass both '--create' and '--dump-only'.\n")
2244     sys.exit(1)
2245
2246   if ((string.find(ctx.trunk_base, '/') > -1)
2247       or (string.find(ctx.tags_base, '/') > -1)
2248       or (string.find(ctx.branches_base, '/') > -1)):
2249     sys.stderr.write("Error: cannot pass multicomponent path to ")
2250     sys.stderr.write("--trunk, --tags, or --branches yet.\n")
2251     sys.stderr.write("  See http://subversion.tigris.org/issues/show_bug.cgi?")
2252     sys.stderr.write("id=1409 ")
2253     sys.stderr.write("for details.\n")
2254     sys.exit(1)
2255
2256   convert(ctx, start_pass=start_pass)
2257
2258
2259 if __name__ == '__main__':
2260   main()