Fix a bug I accidentally introduced in r857.
[cvs2svn.git] / cvs2svn.py
blobd694372171cd7c95e5d3416052745c070f84396a
1 #!/usr/bin/env python
3 # cvs2svn: ...
5 # $LastChangedRevision$
7 # ====================================================================
8 # Copyright (c) 2000-2004 CollabNet. All rights reserved.
10 # This software is licensed as described in the file COPYING, which
11 # you should have received as part of this distribution. The terms
12 # are also available at http://subversion.tigris.org/license-1.html.
13 # If newer versions of this license are posted there, you may use a
14 # newer version instead, at your option.
16 # This software consists of voluntary contributions made by many
17 # individuals. For exact contribution history, see the revision
18 # history and logs, available at http://cvs2svn.tigris.org/.
19 # ====================================================================
21 import rcsparse
22 import os
23 import sys
24 import sha
25 import re
26 import time
27 import fileinput
28 import string
29 import getopt
30 import stat
31 import string
32 import md5
33 import anydbm
34 import marshal
36 # Warnings and errors start with these strings. They are typically
37 # followed by a colon and a space, as in "%s: " ==> "Warning: ".
38 warning_prefix = "Warning"
39 error_prefix = "Error"
41 # Make sure this Python is recent enough.
42 if sys.hexversion < 0x2000000:
43 sys.stderr.write("'%s: Python 2.0 or higher required, "
44 "see www.python.org.\n" % error_prefix)
45 sys.exit(1)
47 # Don't settle for less.
48 if (anydbm._defaultmod.__name__ == 'dumbdbm'
49 or anydbm._defaultmod.__name__ == 'dbm'):
50 print 'ERROR: your installation of Python does not contain a suitable'
51 print ' DBM module. This script cannot continue.'
52 print ' to solve: see http://python.org/doc/current/lib/module-anydbm.html'
53 print ' for details.'
54 sys.exit(1)
56 if hasattr(anydbm._defaultmod, 'bsddb') \
57 and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
58 try:
59 gdbm = __import__('gdbm')
60 except ImportError:
61 sys.stderr.write(warning_prefix +
62 ': The version of the bsddb module found '
63 'on your computer has been reported to malfunction on some datasets, '
64 'causing KeyError exceptions. You may wish to upgrade your Python to '
65 'version 2.3 or later.\n')
66 else:
67 anydbm._defaultmod = gdbm
69 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
70 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
71 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
73 # This really only matches standard '1.1.1.*'-style vendor revisions.
74 # One could conceivably have a file whose default branch is 1.1.3 or
75 # whatever, or was that at some point in time, with vendor revisions
76 # 1.1.3.1, 1.1.3.2, etc. But with the default branch gone now (which
77 # is the only time this regexp gets used), we'd have no basis for
78 # assuming that the non-standard vendor branch had ever been the
79 # default branch anyway, so we don't want this to match them anyway.
80 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
82 DATAFILE = 'cvs2svn-data'
83 DUMPFILE = 'cvs2svn-dump' # The "dumpfile" we create to load into the repos
85 # Skeleton version of an svn filesystem.
86 # See class RepositoryMirror for how these work.
87 SVN_REVISIONS_DB = 'cvs2svn-revisions.db'
88 NODES_DB = 'cvs2svn-nodes.db'
90 # os.popen() on Windows seems to require an access-mode string of 'rb'
91 # in cases where the process will output binary information to stdout.
92 # Without the 'b' we get IOErrors upon closing the pipe. Unfortunately
93 # 'rb' isn't accepted in the Linux version of os.popen(). As a purely
94 # practical matter, we compensate by switching on os.name.
95 if os.name == 'nt':
96 PIPE_READ_MODE = 'rb'
97 PIPE_WRITE_MODE = 'wb'
98 else:
99 PIPE_READ_MODE = 'r'
100 PIPE_WRITE_MODE = 'w'
102 # Record the default RCS branches, if any, for CVS filepaths.
104 # The keys are CVS filepaths, relative to the top of the repository
105 # and with the ",v" stripped off, so they match the cvs paths used in
106 # Commit.commit(). The values are vendor branch revisions, such as
107 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'. The vendor branch revision
108 # represents the highest vendor branch revision thought to have ever
109 # been head of the default branch.
111 # The reason we record a specific vendor revision, rather than a
112 # default branch number, is that there are two cases to handle:
114 # One case is simple. The RCS file lists a default branch explicitly
115 # in its header, such as '1.1.1'. In this case, we know that every
116 # revision on the vendor branch is to be treated as head of trunk at
117 # that point in time.
119 # But there's also a degenerate case. The RCS file does not currently
120 # have a default branch, yet we can deduce that for some period in the
121 # past it probably *did* have one. For example, the file has vendor
122 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
123 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2. In this
124 # case, we should record 1.1.1.96 as the last vendor revision to have
125 # been the head of the default branch.
126 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
128 # Records the origin ranges for branches and tags.
129 # See class RepositoryMirror for how this works.
130 SYMBOLIC_NAME_ROOTS_DB = 'cvs2svn-symroots.db'
132 # See class SymbolicNameTracker for details.
133 SYMBOLIC_NAMES_DB = "cvs2svn-sym-names.db"
135 # Records the author and log message for each changeset.
136 # The keys are author+log digests, the same kind used to identify
137 # unique revisions in the .revs, etc files. Each value is a tuple
138 # of two elements: '(author logmessage)'.
139 METADATA_DB = "cvs2svn-metadata.db"
141 REVS_SUFFIX = '.revs'
142 CLEAN_REVS_SUFFIX = '.c-revs'
143 SORTED_REVS_SUFFIX = '.s-revs'
144 RESYNC_SUFFIX = '.resync'
146 ATTIC = os.sep + 'Attic'
148 SVN_INVALID_REVNUM = -1
150 COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
152 # Things that can happen to a file.
153 OP_NOOP = '-'
154 OP_ADD = 'A'
155 OP_DELETE = 'D'
156 OP_CHANGE = 'C'
158 # A deltatext either does or doesn't represent some change.
159 DELTATEXT_NONEMPTY = 'N'
160 DELTATEXT_EMPTY = 'E'
162 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
164 # Officially, CVS symbolic names must use a fairly restricted set of
165 # characters. Unofficially, CVS 1.10 allows any character but [$,.:;@]
166 # We don't care if some repositories out there use characters outside the
167 # official set, as long as their tags start with a letter.
168 # Since the unofficial set also includes [/\] we need to translate those
169 # into ones that don't conflict with Subversion limitations.
170 symbolic_name_re = re.compile('^[a-zA-Z].*$')
171 symbolic_name_transtbl = string.maketrans('/\\',',;')
173 # A wrapper for anydbm that uses the marshal module to store items as
174 # strings.
175 class Database:
176 def __init__(self, filename, mode):
177 self.db = anydbm.open(filename, mode)
179 def has_key(self, key):
180 return self.db.has_key(key)
182 def __getitem__(self, key):
183 return marshal.loads(self.db[key])
185 def __setitem__(self, key, value):
186 self.db[key] = marshal.dumps(value)
188 def __delitem__(self, key):
189 del self.db[key]
191 class CollectData(rcsparse.Sink):
192 def __init__(self, cvsroot, log_fname_base, default_branches_db):
193 self.cvsroot = cvsroot
194 self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
195 self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
196 self.default_branches_db = default_branches_db
197 self.metadata_db = Database(METADATA_DB, 'n')
198 self.fatal_errors = []
200 # Branch and tag label types.
201 self.BRANCH_LABEL = 0
202 self.VENDOR_BRANCH_LABEL = 1
203 self.TAG_LABEL = 2
204 # A label type to string conversion list
205 self.LABEL_TYPES = [ 'branch', 'vendor branch', 'tag' ]
206 # A dict mapping label names to types
207 self.label_type = { }
209 # See set_fname() for initializations of other variables.
211 def set_fname(self, fname):
212 "Prepare to receive data for a new file."
213 self.fname = fname
215 # revision -> [timestamp, author, operation, old-timestamp]
216 self.rev_data = { }
217 self.prev = { }
219 # Hash mapping branch numbers, like '1.7.2', to branch names,
220 # like 'Release_1_0_dev'.
221 self.branch_names = { }
223 # Hash mapping revision numbers, like '1.7', to lists of names
224 # indicating which branches sprout from that revision, like
225 # ['Release_1_0_dev', 'experimental_driver', ...].
226 self.branchlist = { }
228 # Like self.branchlist, but the values are lists of tag names that
229 # apply to the key revision.
230 self.taglist = { }
232 # This is always a number -- rcsparse calls this the "principal
233 # branch", but CVS and RCS refer to it as the "default branch",
234 # so that's what we call it, even though the rcsparse API setter
235 # method is still 'set_principal_branch'.
236 self.default_branch = None
238 # If the RCS file doesn't have a default branch anymore, but does
239 # have vendor revisions, then we make an educated guess that those
240 # revisions *were* the head of the default branch up until the
241 # commit of 1.2, at which point the file's default branch became
242 # trunk. This records the date at which 1.2 was committed.
243 self.first_non_vendor_revision_date = None
245 def set_principal_branch(self, branch):
246 self.default_branch = branch
248 def set_branch_name(self, branch_number, name):
249 """Record that BRANCH_NUMBER is the branch number for branch NAME,
250 and that NAME sprouts from BRANCH_NUMBER .
251 BRANCH_NUMBER is an RCS branch number with an odd number of components,
252 for example '1.7.2' (never '1.7.0.2')."""
253 if not self.branch_names.has_key(branch_number):
254 self.branch_names[branch_number] = name
255 # The branchlist is keyed on the revision number from which the
256 # branch sprouts, so strip off the odd final component.
257 sprout_rev = branch_number[:branch_number.rfind(".")]
258 if not self.branchlist.has_key(sprout_rev):
259 self.branchlist[sprout_rev] = []
260 self.branchlist[sprout_rev].append(name)
261 else:
262 sys.stderr.write("%s: in '%s':\n"
263 " branch '%s' already has name '%s',\n"
264 " cannot also have name '%s', ignoring the latter\n"
265 % (warning_prefix, self.fname, branch_number,
266 self.branch_names[branch_number], name))
268 def rev_to_branch_name(self, revision):
269 """Return the name of the branch on which REVISION lies.
270 REVISION is a non-branch revision number with an even number of,
271 components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
272 For the convenience of callers, REVISION can also be a trunk
273 revision such as '1.2', in which case just return None."""
274 if trunk_rev.match(revision):
275 return None
276 return self.branch_names.get(revision[:revision.rindex(".")])
278 def add_cvs_branch(self, revision, branch_name):
279 """Record the root revision and branch revision for BRANCH_NAME,
280 based on REVISION. REVISION is a CVS branch number having an even
281 number of components where the second-to-last is '0'. For
282 example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
283 from 1.7 and has branch number 1.7.2."""
284 last_dot = revision.rfind(".")
285 branch_rev = revision[:last_dot]
286 last2_dot = branch_rev.rfind(".")
287 branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
288 self.set_branch_name(branch_rev, branch_name)
290 def get_tags(self, revision):
291 """Return a list of all tag names attached to REVISION.
292 REVISION is a regular revision number like '1.7', and the result
293 never includes branch names, only plain tags."""
294 return self.taglist.get(revision, [])
296 def get_branches(self, revision):
297 """Return a list of all branch names that sprout from REVISION.
298 REVISION is a regular revision number like '1.7'."""
299 return self.branchlist.get(revision, [])
301 def define_tag(self, name, revision):
302 """Record a bidirectional mapping between symbolic NAME and REVISION.
303 REVISION is an unprocessed revision number from the RCS file's
304 header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
305 This function will determine what kind of symbolic name it is by
306 inspection, and record it in the right places."""
307 if not symbolic_name_re.match(name):
308 sys.stderr.write("%s: in '%s':\n"
309 " '%s' is not a valid tag or branch name, ignoring\n"
310 % (warning_prefix, self.fname, name))
311 elif branch_tag.match(revision):
312 label_type = self.BRANCH_LABEL
313 self.add_cvs_branch(revision, name)
314 elif vendor_tag.match(revision):
315 label_type = self.VENDOR_BRANCH_LABEL
316 self.set_branch_name(revision, name)
317 else:
318 label_type = self.TAG_LABEL
319 if not self.taglist.has_key(revision):
320 self.taglist[revision] = []
321 self.taglist[revision].append(name)
323 try:
324 # if label_types are different and at least one is a tag (We
325 # don't want to error on branch/vendor branch mismatches)
326 if (self.label_type[name] != label_type
327 and(self.label_type[name] == self.TAG_LABEL
328 or label_type == self.TAG_LABEL)):
329 err = ("%s: in '%s' (BRANCH/TAG MISMATCH):\n '%s' "
330 " is defined as %s here, but as a %s elsewhere"
331 % (error_prefix, self.fname, name,
332 self.LABEL_TYPES[label_type],
333 self.LABEL_TYPES[self.label_type[name]]))
334 sys.stderr.write(err)
335 self.fatal_errors.append(err)
336 except KeyError:
337 self.label_type[name] = label_type
339 def define_revision(self, revision, timestamp, author, state,
340 branches, next):
341 ### what else?
342 if state == 'dead':
343 op = OP_DELETE
344 else:
345 op = OP_CHANGE
347 # store the rev_data as a list in case we have to jigger the timestamp
348 self.rev_data[revision] = [int(timestamp), author, op, None]
350 # record the previous revision for sanity checking later
351 if trunk_rev.match(revision):
352 self.prev[revision] = next
353 elif next:
354 self.prev[next] = revision
355 for b in branches:
356 self.prev[b] = revision
358 # Ratchet up the highest vendor head revision, if necessary.
359 if self.default_branch:
360 if revision.find(self.default_branch) == 0:
361 # This revision is on the default branch, so record that it is
362 # the new highest vendor head revision.
363 rel_name = relative_name(self.cvsroot, self.fname)[:-2]
364 self.default_branches_db[rel_name] = revision
365 else:
366 # No default branch, so make an educated guess.
367 if revision == '1.2':
368 # This is probably the time when the file stopped having a
369 # default branch, so make a note of it.
370 self.first_non_vendor_revision_date = timestamp
371 else:
372 m = vendor_revision.match(revision)
373 if m and ((not self.first_non_vendor_revision_date)
374 or (timestamp < self.first_non_vendor_revision_date)):
375 # We're looking at a vendor revision, and it wasn't
376 # committed after this file lost its default branch, so bump
377 # the maximum trunk vendor revision in the permanent record.
378 rel_name = relative_name(self.cvsroot, self.fname)[:-2]
379 self.default_branches_db[rel_name] = revision
381 # Check for unlabeled branches, record them. We tried to collect
382 # all branch names when we parsed the symbolic name header
383 # earlier, of course, but that didn't catch unlabeled branches.
384 # If a branch is unlabeled, this is our first encounter with it,
385 # so we have to record its data now.
386 if not trunk_rev.match(revision):
387 branch_number = revision[:revision.rindex(".")]
388 branch_name = "unlabeled-" + branch_number
389 if not self.branch_names.has_key(branch_number):
390 self.set_branch_name(branch_number, branch_name)
392 def tree_completed(self):
393 "The revision tree has been parsed. Analyze it for consistency."
395 # Our algorithm depends upon the timestamps on the revisions occuring
396 # monotonically over time. That is, we want to see rev 1.34 occur in
397 # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
398 # sorting), and then tried to insert 1.34, we'd be screwed.
400 # to perform the analysis, we'll simply visit all of the 'previous'
401 # links that we have recorded and validate that the timestamp on the
402 # previous revision is before the specified revision
404 # if we have to resync some nodes, then we restart the scan. just keep
405 # looping as long as we need to restart.
406 while 1:
407 for current, prev in self.prev.items():
408 if not prev:
409 # no previous revision exists (i.e. the initial revision)
410 continue
411 t_c = self.rev_data[current][0]
412 t_p = self.rev_data[prev][0]
413 if t_p >= t_c:
414 # the previous revision occurred later than the current revision.
415 # shove the previous revision back in time (and any before it that
416 # may need to shift).
417 while t_p >= t_c:
418 self.rev_data[prev][0] = t_c - 1 # new timestamp
419 self.rev_data[prev][3] = t_p # old timestamp
421 print "RESYNC: '%s' (%s) : old time='%s' new time='%s'" \
422 % (relative_name(self.cvsroot, self.fname),
423 prev, time.ctime(t_p), time.ctime(t_c - 1))
425 current = prev
426 prev = self.prev[current]
427 if not prev:
428 break
429 t_c = t_c - 1 # self.rev_data[current][0]
430 t_p = self.rev_data[prev][0]
432 # break from the for-loop
433 break
434 else:
435 # finished the for-loop (no resyncing was performed)
436 return
438 def set_revision_info(self, revision, log, text):
439 timestamp, author, op, old_ts = self.rev_data[revision]
440 digest = sha.new(log + '\0' + author).hexdigest()
441 if old_ts:
442 # the timestamp on this revision was changed. log it for later
443 # resynchronization of other files's revisions that occurred
444 # for this time and log message.
445 self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
447 # "...Give back one kadam to honor the Hebrew God whose Ark this is."
448 # -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
450 # If revision 1.1 appears to have been created via 'cvs add'
451 # instead of 'cvs import', then this file probably never had a
452 # default branch, so retroactively remove its record in the
453 # default branches db. The test is that the log message CVS uses
454 # for 1.1 in imports is "Initial revision\n" with no period.
455 if revision == '1.1' and log != 'Initial revision\n':
456 rel_name = relative_name(self.cvsroot, self.fname)[:-2]
457 if self.default_branches_db.has_key(rel_name):
458 del self.default_branches_db[rel_name]
460 if text:
461 deltatext_code = DELTATEXT_NONEMPTY
462 else:
463 deltatext_code = DELTATEXT_EMPTY
465 write_revs_line(self.revs, timestamp, digest, op, revision,
466 deltatext_code, self.fname,
467 self.rev_to_branch_name(revision),
468 self.get_tags(revision),
469 self.get_branches(revision))
471 if not self.metadata_db.has_key(digest):
472 self.metadata_db[digest] = (author, log)
474 def run_command(command):
475 if os.system(command):
476 sys.exit('Command failed: "%s"' % command)
478 def make_path(ctx, path, branch_name = None, tag_name = None):
479 """Return the trunk path, branch path, or tag path for PATH.
480 CTX holds the name of the branches or tags directory, which is
481 prepended to PATH when constructing a branch or tag path.
483 If PATH is empty or None, return the root trunk|branch|tag path.
485 It is an error to pass both a BRANCH_NAME and a TAG_NAME."""
487 # For a while, we treated each top-level subdir of the CVS
488 # repository as a "project root" and interpolated the appropriate
489 # genealogy (trunk|tag|branch) in according to the official
490 # recommended layout. For example, the path '/foo/bar/baz.c' on
491 # branch 'Rel2' would become
493 # /foo/branches/Rel2/bar/baz.c
495 # and on trunk it would become
497 # /foo/trunk/bar/baz.c
499 # However, we went back to the older and simpler method of just
500 # prepending the genealogy to the front, instead of interpolating.
501 # So now we produce:
503 # /branches/Rel2/foo/bar/baz.c
504 # /trunk/foo/bar/baz.c
506 # Why? Well, Jack Repenning pointed out that this way is much
507 # friendlier to "anonymously rooted subtrees" (that's a tree where
508 # the name of the top level dir doesn't matter, the point is that if
509 # you cd into it and, say, run 'make', something good will happen).
510 # By interpolating, we made it impossible to point cvs2svn at some
511 # subdir in the CVS repository and convert it as a project, because
512 # we'd treat every subdir underneath it as an independent project
513 # root, which is probably not what the user wanted.
515 # Also, see Blair Zajac's post
517 # http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
519 # and the surrounding thread, for why what people really want is a
520 # way of specifying an in-repository prefix path, not interpolation.
522 # Check caller sanity.
523 if branch_name and tag_name:
524 sys.stderr.write("%s: make_path() miscalled: both branch and tag given.\n"
525 % error_prefix)
526 sys.exit(1)
528 if branch_name:
529 branch_name = branch_name.translate(symbolic_name_transtbl)
530 if path:
531 return ctx.branches_base + '/' + branch_name + '/' + path
532 else:
533 return ctx.branches_base + '/' + branch_name
534 elif tag_name:
535 tag_name = tag_name.translate(symbolic_name_transtbl)
536 if path:
537 return ctx.tags_base + '/' + tag_name + '/' + path
538 else:
539 return ctx.tags_base + '/' + tag_name
540 else:
541 if path:
542 return ctx.trunk_base + '/' + path
543 else:
544 return ctx.trunk_base
547 def relative_name(cvsroot, fname):
548 l = len(cvsroot)
549 if fname[:l] == cvsroot:
550 if fname[l] == os.sep:
551 return string.replace(fname[l+1:], os.sep, '/')
552 return string.replace(fname[l:], os.sep, '/')
553 sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
554 " cvsroot\n" % (error_prefix, cvsroot, fname))
555 sys.exit(1)
558 def visit_file(arg, dirname, files):
559 cd, p, stats = arg
560 for fname in files:
561 if fname[-2:] != ',v':
562 continue
563 pathname = os.path.join(dirname, fname)
564 if dirname[-6:] == ATTIC:
565 # drop the 'Attic' portion from the pathname
566 ### we should record this so we can easily insert it back in
567 cd.set_fname(os.path.join(dirname[:-6], fname))
568 else:
569 cd.set_fname(pathname)
570 print pathname
571 try:
572 p.parse(open(pathname, 'rb'), cd)
573 stats[0] = stats[0] + 1
574 except (rcsparse.common.RCSParseError, ValueError, RuntimeError):
575 print "%s: '%s' is not a valid ,v file, ignoring" \
576 % (warning_prefix, pathname)
577 except:
578 print "Exception occurred while parsing %s" % pathname
579 raise
582 # Return a string that has not been returned by gen_key() before.
583 gen_key_base = 0L
584 def gen_key():
585 global gen_key_base
586 key = '%x' % gen_key_base
587 gen_key_base = gen_key_base + 1
588 return key
591 class Change:
592 """Class for recording what actually happened when a change is made,
593 because not all of the result is guessable by the caller.
594 See RepositoryMirror.change_path() for more.
596 The fields are
599 OP_ADD path was added, OP_CHANGE if changed, or OP_NOOP if no
600 action.
602 closed_tags:
603 List of tags that this path can no longer be the source of,
604 that is, tags which could be rooted in the path before the
605 change, but not after.
607 closed_branches:
608 Like closed_tags, but for branches.
610 deleted_entries:
611 The list of entries deleted from the destination after
612 copying a directory, or None.
614 copyfrom_rev:
615 The actual revision from which the path was copied, which
616 may be one less than the requested revision when the path
617 was deleted in the requested revision, or None."""
618 def __init__(self, op, closed_tags, closed_branches,
619 deleted_entries=None, copyfrom_rev=None):
620 self.op = op
621 self.closed_tags = closed_tags
622 self.closed_branches = closed_branches
623 self.deleted_entries = deleted_entries
624 self.copyfrom_rev = copyfrom_rev
627 class RepositoryMirror:
628 def __init__(self):
629 # This corresponds to the 'revisions' table in a Subversion fs.
630 self.revs_db_file = SVN_REVISIONS_DB
631 self.revs_db = Database(self.revs_db_file, 'n')
633 # This corresponds to the 'nodes' table in a Subversion fs. (We
634 # don't need a 'representations' or 'strings' table because we
635 # only track metadata, not file contents.)
636 self.nodes_db_file = NODES_DB
637 self.nodes_db = Database(self.nodes_db_file, 'n')
639 # This tracks which symbolic names the current "head" of a given
640 # filepath could be the origin node for. When the next commit on
641 # that path comes along, we can tell which symbolic names
642 # originated in the previous version, and signal back to the
643 # caller that the file can no longer be the origin for those names.
645 # The values are tuples, (tags, branches), where each value is a
646 # list.
647 self.symroots_db_file = SYMBOLIC_NAME_ROOTS_DB
648 self.symroots_db = Database(self.symroots_db_file, 'n')
650 # When copying a directory (say, to create part of a branch), we
651 # pass change_path() a list of expected entries, so it can remove
652 # any that are in the source but don't belong on the branch.
653 # However, because creating a given region of a branch can involve
654 # copying from several sources, we don't want later copy
655 # operations to delete entries that were legitimately created by
656 # earlier copy ops. So after a copy, the directory records
657 # legitimate entries under this key, in a dictionary (the keys are
658 # entry names, the values can be ignored).
659 self.approved_entries = "/approved-entries"
661 # Set on a directory that's mutable in the revision currently
662 # being constructed. (Yes, this is exactly analogous to
663 # the Subversion filesystem code's concept of mutability.)
664 self.mutable_flag = "/mutable"
665 # This could represent a new mutable directory or file.
666 self.empty_mutable_thang = { self.mutable_flag : 1 }
668 # Init a root directory with no entries at revision 0.
669 self.youngest = 0
670 youngest_key = gen_key()
671 self.revs_db[str(self.youngest)] = youngest_key
672 self.nodes_db[youngest_key] = {}
674 def new_revision(self):
675 """Stabilize the current revision, then start the next one.
676 (Increments youngest.)"""
677 self.stabilize_youngest()
678 self.revs_db[str(self.youngest + 1)] \
679 = self.revs_db[str(self.youngest)]
680 self.youngest = self.youngest + 1
682 def _stabilize_directory(self, key):
683 """Close the directory whose node key is KEY."""
684 dir = self.nodes_db[key]
685 if dir.has_key(self.mutable_flag):
686 del dir[self.mutable_flag]
687 if dir.has_key(self.approved_entries):
688 del dir[self.approved_entries]
689 for entry_key in dir.keys():
690 if not entry_key[0] == '/':
691 self._stabilize_directory(dir[entry_key])
692 self.nodes_db[key] = dir
694 def stabilize_youngest(self):
695 """Stabilize the current revision by removing mutable flags."""
696 root_key = self.revs_db[str(self.youngest)]
697 self._stabilize_directory(root_key)
699 def probe_path(self, path, revision=-1, debugging=None):
700 """If PATH exists in REVISION of the svn repository mirror,
701 return its leaf value, else return None.
702 If DEBUGGING is true, then print trace output to stdout.
703 REVISION defaults to youngest, and PATH must not start with '/'."""
704 components = string.split(path, '/')
705 if revision == -1:
706 revision = self.youngest
708 if debugging:
709 print "PROBING path: '%s' in %d" % (path, revision)
711 parent_key = self.revs_db[str(revision)]
712 parent = self.nodes_db[parent_key]
713 previous_component = "/"
715 i = 1
716 for component in components:
718 if debugging:
719 print " " * i,
720 print "'%s' key: %s, val:" % (previous_component, parent_key), parent
722 if not parent.has_key(component):
723 if debugging:
724 print " PROBE ABANDONED: '%s' does not contain '%s'" \
725 % (previous_component, component)
726 return None
728 this_entry_key = parent[component]
729 this_entry_val = self.nodes_db[this_entry_key]
730 parent_key = this_entry_key
731 parent = this_entry_val
732 previous_component = component
733 i = i + 1
735 if debugging:
736 print " " * i,
737 print "parent_key: %s, val:" % parent_key, parent
739 # It's not actually a parent at this point, it's the leaf node.
740 return parent
742 def change_path(self, path, tags, branches,
743 intermediate_dir_func=None,
744 copyfrom_path=None, copyfrom_rev=None,
745 expected_entries=None, only_if_already_exists=None):
746 """Record a change to PATH. PATH may not have a leading slash.
747 Return a Change instance representing the result of the
748 change.
750 TAGS are any tags that sprout from this revision of PATH, BRANCHES
751 are any branches that sprout from this revision of PATH.
753 If INTERMEDIATE_DIR_FUNC is not None, then invoke it once on
754 each full path to each missing intermediate directory in PATH, in
755 order from shortest to longest.
757 If COPYFROM_REV and COPYFROM_PATH are not None, then they are a
758 revision and path to record as the copyfrom sources of this node.
759 Since this implies an add (OP_ADD), it would be reasonable to
760 error and exit if the copyfrom args are present but the node also
761 already exists. Reasonable -- but not what we do :-). The most
762 useful behavior for callers is instead to report that nothing was
763 done, by returning OP_NOOP for Change.op, so that's what we do.
765 It is an error for only one copyfrom argument to be present.
767 If EXPECTED_ENTRIES is not None, then it holds entries expected
768 to be in the dst after the copy. Any entries in the new dst but
769 not in EXPECTED_ENTRIES are removed (ignoring keys beginning with
770 '/'), and the removed entries returned in Change.deleted_entries,
771 which are otherwise None.
773 No action is taken for keys in EXPECTED_ENTRIES but not in the
774 dst; it is assumed that the caller will compensate for these by
775 calling change_path again with other arguments.
777 If ONLY_IF_ALREADY_EXISTS is set, then do a no-op, rather than an add,
778 if the path does not exist. This is to allow pruning using EXPECTED_ENTRIES
779 without risking erroneously adding a path."""
781 # Check caller sanity.
782 if ((copyfrom_rev and not copyfrom_path) or
783 (copyfrom_path and not copyfrom_rev)):
784 sys.stderr.write("%s: change_path() called with one copyfrom "
785 "argument but not the other.\n" % error_prefix)
786 sys.exit(1)
788 components = string.split(path, '/')
789 path_so_far = None
791 deletions = []
793 parent_key = self.revs_db[str(self.youngest)]
794 parent = self.nodes_db[parent_key]
795 if not parent.has_key(self.mutable_flag):
796 parent_key = gen_key()
797 parent[self.mutable_flag] = 1
798 self.nodes_db[parent_key] = parent
799 self.revs_db[str(self.youngest)] = parent_key
801 for component in components[:-1]:
802 # parent is always mutable at the top of the loop
804 if path_so_far:
805 path_so_far = path_so_far + '/' + component
806 else:
807 path_so_far = component
809 # Ensure that the parent has an entry for this component.
810 if not parent.has_key(component):
811 if only_if_already_exists:
812 return Change(OP_NOOP, [], [], deletions)
813 # else
814 new_child_key = gen_key()
815 parent[component] = new_child_key
816 self.nodes_db[new_child_key] = self.empty_mutable_thang
817 self.nodes_db[parent_key] = parent
818 if intermediate_dir_func:
819 intermediate_dir_func(path_so_far)
821 # One way or another, parent dir now has an entry for component,
822 # so grab it, see if it's mutable, and DTRT if it's not. (Note
823 # it's important to reread the entry value from the db, even
824 # though we might have just written it -- if we tweak existing
825 # data structures, we could modify self.empty_mutable_thang,
826 # which must not happen.)
827 this_entry_key = parent[component]
828 this_entry_val = self.nodes_db[this_entry_key]
829 if not this_entry_val.has_key(self.mutable_flag):
830 this_entry_val[self.mutable_flag] = 1
831 this_entry_key = gen_key()
832 parent[component] = this_entry_key
833 self.nodes_db[this_entry_key] = this_entry_val
834 self.nodes_db[parent_key] = parent
836 parent_key = this_entry_key
837 parent = this_entry_val
839 # Now change the last node, the versioned file. Just like at the
840 # top of the above loop, parent is already mutable.
841 op = OP_ADD
842 if self.symroots_db.has_key(path):
843 old_names = self.symroots_db[path]
844 else:
845 old_names = [], []
846 last_component = components[-1]
847 new_val = { }
848 if parent.has_key(last_component):
849 # The contract for copying over existing nodes is to do nothing
850 # and return:
851 if copyfrom_path:
852 return Change(OP_NOOP, old_names[0], old_names[1], deletions)
853 # else
854 op = OP_CHANGE
855 new_val = self.nodes_db[parent[last_component]]
856 elif only_if_already_exists:
857 return Change(OP_NOOP, [], [], deletions)
859 leaf_key = gen_key()
860 if copyfrom_path:
861 new_val = self.probe_path(copyfrom_path, copyfrom_rev)
862 if new_val is None:
863 # Sometimes a branch is rooted in a revision that RCS has
864 # marked as 'dead'. There is no reason to assume that the
865 # current path shares any history with any older live parent
866 # of the dead revision, so we do nothing and return.
867 return Change(OP_NOOP, [], [], deletions)
868 if expected_entries is not None:
869 # If it is not None, then even if it is an empty list/tuple,
870 # we need to approve this item in its parent's approved entries list.
871 approved_entries = parent.get(self.approved_entries) or {}
872 approved_entries[last_component] = 1
873 parent[self.approved_entries] = approved_entries
874 if expected_entries:
875 approved_entries = new_val.get(self.approved_entries) or { }
876 new_approved_entries = { }
877 for ent in new_val.keys():
878 if (ent[0] != '/'):
879 if (not expected_entries.has_key(ent)
880 and not approved_entries.has_key(ent)):
881 del new_val[ent]
882 deletions.append(ent)
883 else:
884 new_approved_entries[ent] = 1
885 new_val[self.approved_entries] = new_approved_entries
886 parent[last_component] = leaf_key
887 self.nodes_db[parent_key] = parent
888 self.symroots_db[path] = (tags, branches)
889 new_val[self.mutable_flag] = 1
890 self.nodes_db[leaf_key] = new_val
892 return Change(op, old_names[0], old_names[1], deletions, copyfrom_rev)
894 def delete_path(self, path, tags, branches, prune=None):
895 """Delete PATH from the tree. PATH may not have a leading slash.
897 Return a tuple (path_deleted, closed_tags, closed_branches), where
898 path_deleted is the path actually deleted or None if PATH did not
899 exist, and closed_tags and closed_branches are lists of symbolic
900 names closed off by this deletion -- that is, tags or branches
901 which could be rooted in the previous revision of PATH, but not in
902 this revision, because this rev changes PATH. If path_deleted is
903 None, then closed_tags and closed_branches will both be empty.
905 TAGS are any tags that sprout from this revision of PATH, BRANCHES
906 are any branches that sprout from this revision of PATH. (I can't
907 imagine that there are any of either, what to do if there are?)
909 If PRUNE is not None, then delete the highest possible directory,
910 which means the returned path may differ from PATH. In other
911 words, if PATH was the last entry in its parent, then delete
912 PATH's parent, unless it too is the last entry in *its* parent, in
913 which case delete that parent, and so on up the chain, until a
914 directory is encountered that has an entry which is not a member
915 of the parent stack of the original target.
917 NOTE: This function does *not* allow you delete top-level entries
918 (like /trunk, /branches, /tags), not does it prune upwards beyond
919 those entries.
921 PRUNE is like the -P option to 'cvs checkout'."""
923 components = string.split(path, '/')
924 path_so_far = None
926 parent_key = self.revs_db[str(self.youngest)]
927 parent = self.nodes_db[parent_key]
929 # As we walk down to find the dest, we remember each parent
930 # directory's name and db key, in reverse order: push each new key
931 # onto the front of the list, so that by the time we reach the
932 # destination node, the zeroth item in the list is the parent of
933 # that destination.
935 # Then if we actually do the deletion, we walk the list from left
936 # to right, replacing as appropriate.
938 # The root directory has name None.
939 parent_chain = [ ]
940 parent_chain.insert(0, (None, parent_key))
942 def is_prunable(dir):
943 """Return true if DIR, a dictionary representing a directory,
944 has just zero or one non-special entry, else return false.
945 (In a pure world, we'd just ask len(DIR) > 1; it's only
946 because the directory might have mutable flags and other special
947 entries that we need this function at all.)"""
948 num_items = len(dir)
949 if num_items > 3:
950 return None
951 if num_items == 3 or num_items == 2:
952 real_entries = 0
953 for key in dir.keys():
954 if not key[0] == '/': real_entries = real_entries + 1
955 if real_entries > 1:
956 return None
957 else:
958 return 1
959 else:
960 return 1
962 # We never prune our top-level directories (/trunk, /tags, /branches)
963 if len(components) < 2:
964 return None, [], []
966 for component in components[:-1]:
967 if path_so_far:
968 path_so_far = path_so_far + '/' + component
969 else:
970 path_so_far = component
972 # If we can't reach the dest, then we don't need to do anything.
973 if not parent.has_key(component):
974 return None, [], []
976 # Otherwise continue downward, dropping breadcrumbs.
977 this_entry_key = parent[component]
978 this_entry_val = self.nodes_db[this_entry_key]
979 parent_key = this_entry_key
980 parent = this_entry_val
981 parent_chain.insert(0, (component, parent_key))
983 # If the target is not present in its parent, then we're done.
984 last_component = components[-1]
985 old_names = [], []
986 if not parent.has_key(last_component):
987 return None, [], []
988 elif self.symroots_db.has_key(path):
989 old_names = self.symroots_db[path]
990 del self.symroots_db[path]
992 # The target is present, so remove it and bubble up, making a new
993 # mutable path and/or pruning as necessary.
994 pruned_count = 0
995 prev_entry_name = last_component
996 new_key = None
997 for parent_item in parent_chain:
998 pkey = parent_item[1]
999 pval = self.nodes_db[pkey]
1001 # If we're pruning at all, and we're looking at a prunable thing
1002 # (and that thing isn't one of our top-level directories --
1003 # trunk, tags, branches) ...
1004 if prune and (new_key is None) and is_prunable(pval) \
1005 and parent_item != parent_chain[-2]:
1006 # ... then up our count of pruned items, and do nothing more.
1007 # All the action takes place when we hit a non-prunable
1008 # parent.
1009 pruned_count = pruned_count + 1
1010 else:
1011 # Else, we've hit a non-prunable, or aren't pruning, so bubble
1012 # up the new gospel.
1013 pval[self.mutable_flag] = 1
1014 if new_key is None:
1015 del pval[prev_entry_name]
1016 else:
1017 pval[prev_entry_name] = new_key
1018 new_key = gen_key()
1020 prev_entry_name = parent_item[0]
1021 if new_key:
1022 self.nodes_db[new_key] = pval
1024 if new_key is None:
1025 new_key = gen_key()
1026 self.nodes_db[new_key] = self.empty_mutable_thang
1028 # Install the new root entry.
1029 self.revs_db[str(self.youngest)] = new_key
1031 # Sanity check -- this should be a "can't happen".
1032 if pruned_count > len(components):
1033 sys.stderr.write("%s: deleting '%s' tried to prune %d components.\n"
1034 % (error_prefix, path, pruned_count))
1035 sys.exit(1)
1037 if pruned_count:
1038 if pruned_count == len(components):
1039 # We never prune away the root directory, so back up one component.
1040 pruned_count = pruned_count - 1
1041 retpath = string.join(components[:0 - pruned_count], '/')
1042 else:
1043 retpath = path
1045 return retpath, old_names[0], old_names[1]
1047 ### We've no place to put tags + branches. Suspect we just
1048 ### shouldn't be taking them as arguments, which the doc string
1049 ### implies already. Ponder.
1051 def close(self):
1052 # Just stabilize the last revision. This may or may not affect
1053 # anything, but if we end up using the mirror for anything after
1054 # this, it's nice to know the '/mutable' entries are gone.
1055 self.stabilize_youngest()
1057 if sys.platform == "win32":
1058 def escape_shell_arg(str):
1059 return '"' + string.replace(str, '"', '"^""') + '"'
1060 else:
1061 def escape_shell_arg(str):
1062 return "'" + string.replace(str, "'", "'\\''") + "'"
1064 class Dumper:
1065 def __init__(self, ctx):
1066 'Open DUMPFILE_PATH, and initialize revision to REVISION.'
1067 self.dumpfile_path = ctx.dumpfile
1068 self.revision = 0
1069 self.repos_mirror = RepositoryMirror()
1070 self.svnadmin = ctx.svnadmin
1071 self.target = ctx.target
1072 self.dump_only = ctx.dump_only
1073 self.dumpfile = None
1074 self.path_encoding = ctx.encoding
1075 self.loader_pipe = None
1077 # If all we're doing here is dumping, we can go ahead and
1078 # initialize our single dumpfile. Else, if we're suppose to
1079 # create the repository, do so.
1080 if self.dump_only:
1081 self.init_dumpfile()
1082 self.write_dumpfile_header(self.dumpfile)
1083 else:
1084 if not ctx.existing_svnrepos:
1085 print "creating repos '%s'" % (self.target)
1086 run_command('%s create %s %s' % (self.svnadmin, ctx.bdb_txn_nosync
1087 and "--bdb-txn-nosync" or "", self.target))
1088 self.loader_pipe = os.popen('%s load -q %s' %
1089 (self.svnadmin, self.target), PIPE_WRITE_MODE)
1090 self.write_dumpfile_header(self.loader_pipe)
1093 def init_dumpfile(self):
1094 # Open the dumpfile for binary-mode write.
1095 self.dumpfile = open(self.dumpfile_path, 'wb')
1098 def write_dumpfile_header(self, fileobj):
1099 # Initialize the dumpfile with the standard headers:
1101 # The CVS repository doesn't have a UUID, and the Subversion
1102 # repository will be created with one anyway. So when we load
1103 # the dumpfile, we don't specify a UUID.
1104 fileobj.write('SVN-fs-dump-format-version: 2\n\n')
1106 def flush_and_remove_dumpfile(self):
1107 if self.dumpfile is None:
1108 return
1109 self.dumpfile.close()
1110 print "piping revision %d into '%s' loader" % (self.revision, self.target)
1111 dumpfile = open(self.dumpfile_path, 'rb')
1112 while 1:
1113 data = dumpfile.read(1024*1024) # Choice of 1MB chunks is arbitrary
1114 if not len(data): break
1115 self.loader_pipe.write(data)
1116 dumpfile.close()
1118 os.remove(self.dumpfile_path)
1120 def start_revision(self, props):
1121 """Write the next revision, with properties, to the dumpfile.
1122 Return the newly started revision."""
1124 # If this is not a --dump-only, we need to flush (load into the
1125 # repository) any dumpfile data we have already written and the
1126 # init a new dumpfile before starting this revision.
1128 if not self.dump_only:
1129 if self.revision > 0:
1130 self.flush_and_remove_dumpfile()
1131 self.init_dumpfile()
1133 self.revision = self.revision + 1
1135 # A revision typically looks like this:
1137 # Revision-number: 1
1138 # Prop-content-length: 129
1139 # Content-length: 129
1141 # K 7
1142 # svn:log
1143 # V 27
1144 # Log message for revision 1.
1145 # K 10
1146 # svn:author
1147 # V 7
1148 # jrandom
1149 # K 8
1150 # svn:date
1151 # V 27
1152 # 2003-04-22T22:57:58.132837Z
1153 # PROPS-END
1155 # Notice that the length headers count everything -- not just the
1156 # length of the data but also the lengths of the lengths, including
1157 # the 'K ' or 'V ' prefixes.
1159 # The reason there are both Prop-content-length and Content-length
1160 # is that the former includes just props, while the latter includes
1161 # everything. That's the generic header form for any entity in a
1162 # dumpfile. But since revisions only have props, the two lengths
1163 # are always the same for revisions.
1165 # Calculate the total length of the props section.
1166 total_len = 10 # len('PROPS-END\n')
1167 for propname in props.keys():
1168 klen = len(propname)
1169 klen_len = len('K %d' % klen)
1170 vlen = len(props[propname])
1171 vlen_len = len('V %d' % vlen)
1172 # + 4 for the four newlines within a given property's section
1173 total_len = total_len + klen + klen_len + vlen + vlen_len + 4
1175 # Print the revision header and props
1176 self.dumpfile.write('Revision-number: %d\n'
1177 'Prop-content-length: %d\n'
1178 'Content-length: %d\n'
1179 '\n'
1180 % (self.revision, total_len, total_len))
1182 for propname in props.keys():
1183 self.dumpfile.write('K %d\n'
1184 '%s\n'
1185 'V %d\n'
1186 '%s\n' % (len(propname),
1187 propname,
1188 len(props[propname]),
1189 props[propname]))
1191 self.dumpfile.write('PROPS-END\n')
1192 self.dumpfile.write('\n')
1194 self.repos_mirror.new_revision()
1195 return self.revision
1197 def add_dir(self, path):
1198 self.dumpfile.write("Node-path: %s\n"
1199 "Node-kind: dir\n"
1200 "Node-action: add\n"
1201 "Prop-content-length: 10\n"
1202 "Content-length: 10\n"
1203 "\n"
1204 "PROPS-END\n"
1205 "\n"
1206 "\n" % self.utf8_path(path))
1208 def utf8_path(self, path):
1209 """Return UTF-8 encoded 'path' based on ctx.path_encoding."""
1210 try:
1211 ### Log messages can be converted with 'replace' strategy.
1212 ### We can't afford that here.
1213 unicode_path = unicode(path, self.path_encoding, 'strict')
1214 return unicode_path.encode('utf-8')
1216 except UnicodeError:
1217 print "Unable to convert a path '%s' to internal encoding." % path
1218 print "Consider rerunning with (for example) '--encoding=latin1'"
1219 sys.exit(1)
1222 def probe_path(self, path):
1223 """Return true if PATH exists in the youngest tree of the svn
1224 repository, else return None. PATH does not start with '/'."""
1225 if self.repos_mirror.probe_path(path) is None:
1226 return None
1227 else:
1228 return 1
1230 def copy_path(self, svn_src_path, svn_src_rev, svn_dst_path, entries=None):
1231 """If it wouldn't be redundant to do so, emit a copy of SVN_SRC_PATH at
1232 SVN_SRC_REV to SVN_DST_PATH.
1234 Return 1 if the copy was done, None otherwise.
1236 If ENTRIES is not None, it is a dictionary whose keys are the full
1237 set of entries the new copy is expected to have -- and therefore
1238 any entries in the new dst but not in ENTRIES will be removed.
1239 (Keys in ENTRIES beginning with '/' are ignored.)
1241 No action is taken for keys in ENTRIES but not in the dst; it is
1242 assumed that the caller will compensate for these by calling
1243 copy_path again with other arguments."""
1244 change = self.repos_mirror.change_path(svn_dst_path,
1245 [], [],
1246 self.add_dir,
1247 svn_src_path, svn_src_rev,
1248 entries)
1249 if change.op == OP_ADD:
1250 if change.copyfrom_rev >= self.revision:
1251 sys.stderr.write("%s: invalid copyfrom revision %d used while\n"
1252 "creating revision %d in dumpfile.\n"
1253 % (error_prefix, change.copyfrom_rev, self.revision))
1254 sys.exit(1)
1256 # We don't need to include "Node-kind:" for copies; the loader
1257 # ignores it anyway and just uses the source kind instead.
1258 self.dumpfile.write('Node-path: %s\n'
1259 'Node-action: add\n'
1260 'Node-copyfrom-rev: %d\n'
1261 'Node-copyfrom-path: /%s\n'
1262 '\n'
1263 % (self.utf8_path(svn_dst_path),
1264 change.copyfrom_rev,
1265 self.utf8_path(svn_src_path)))
1267 for ent in change.deleted_entries:
1268 self.dumpfile.write('Node-path: %s\n'
1269 'Node-action: delete\n'
1270 '\n' % (self.utf8_path(svn_dst_path + '/' + ent)))
1271 return 1
1272 return None
1274 def prune_entries(self, path, expected):
1275 """Delete any entries in PATH that are not in list EXPECTED.
1276 PATH need not be a directory, but of course nothing will happen if
1277 it's a file. Entries beginning with '/' are ignored as usual."""
1278 change = self.repos_mirror.change_path(path,
1279 [], [],
1280 self.add_dir,
1281 None, None,
1282 expected, 1)
1283 for ent in change.deleted_entries:
1284 self.dumpfile.write('Node-path: %s\n'
1285 'Node-action: delete\n'
1286 '\n' % (self.utf8_path(path + '/' + ent)))
1288 def add_or_change_path(self, cvs_path, svn_path, cvs_rev, rcs_file,
1289 tags, branches, cvs_revnums):
1291 # figure out the real file path for "co"
1292 try:
1293 f_st = os.stat(rcs_file)
1294 except os.error:
1295 dirname, fname = os.path.split(rcs_file)
1296 rcs_file = os.path.join(dirname, 'Attic', fname)
1297 f_st = os.stat(rcs_file)
1299 # We begin with only a "CVS revision" property.
1300 if cvs_revnums:
1301 prop_contents = 'K 15\ncvs2svn:cvs-rev\nV %d\n%s\n' \
1302 % (len(cvs_rev), cvs_rev)
1303 else:
1304 prop_contents = ''
1306 # Check for executable-ness.
1307 if f_st[0] & stat.S_IXUSR:
1308 prop_contents = prop_contents + 'K 14\nsvn:executable\nV 1\n*\n'
1310 # Calculate the property length (+10 for "PROPS-END\n")
1311 props_len = len(prop_contents) + 10
1313 ### FIXME: We ought to notice the -kb flag set on the RCS file and
1314 ### use it to set svn:mime-type.
1316 basename = os.path.basename(rcs_file[:-2])
1317 pipe_cmd = 'co -q -x,v -p%s %s' % (cvs_rev, escape_shell_arg(rcs_file))
1318 pipe = os.popen(pipe_cmd, PIPE_READ_MODE)
1320 # You might think we could just test
1322 # if cvs_rev[-2:] == '.1':
1324 # to determine if this path exists in head yet. But that wouldn't
1325 # be perfectly reliable, both because of 'cvs commit -r', and also
1326 # the possibility of file resurrection.
1327 change = self.repos_mirror.change_path(svn_path, tags, branches,
1328 self.add_dir)
1330 if change.op == OP_ADD:
1331 action = 'add'
1332 else:
1333 action = 'change'
1335 self.dumpfile.write('Node-path: %s\n'
1336 'Node-kind: file\n'
1337 'Node-action: %s\n'
1338 'Prop-content-length: %d\n'
1339 'Text-content-length: '
1340 % (self.utf8_path(svn_path), action, props_len))
1342 pos = self.dumpfile.tell()
1344 self.dumpfile.write('0000000000000000\n'
1345 'Text-content-md5: 00000000000000000000000000000000\n'
1346 'Content-length: 0000000000000000\n'
1347 '\n')
1349 self.dumpfile.write(prop_contents + 'PROPS-END\n')
1351 # Insert the rev contents, calculating length and checksum as we go.
1352 checksum = md5.new()
1353 length = 0
1354 buf = pipe.read()
1355 while buf:
1356 checksum.update(buf)
1357 length = length + len(buf)
1358 self.dumpfile.write(buf)
1359 buf = pipe.read()
1360 if pipe.close() is not None:
1361 sys.exit('%s: Command failed: "%s"' % (error_prefix, pipe_cmd))
1363 # Go back to patch up the length and checksum headers:
1364 self.dumpfile.seek(pos, 0)
1365 # We left 16 zeros for the text length; replace them with the real
1366 # length, padded on the left with spaces:
1367 self.dumpfile.write('%16d' % length)
1368 # 16... + 1 newline + len('Text-content-md5: ') == 35
1369 self.dumpfile.seek(pos + 35, 0)
1370 self.dumpfile.write(checksum.hexdigest())
1371 # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
1372 self.dumpfile.seek(pos + 84, 0)
1373 # The content length is the length of property data, text data,
1374 # and any metadata around/inside around them.
1375 self.dumpfile.write('%16d' % (length + props_len))
1376 # Jump back to the end of the stream
1377 self.dumpfile.seek(0, 2)
1379 # This record is done (write two newlines -- one to terminate
1380 # contents that weren't themselves newline-termination, one to
1381 # provide a blank line for readability.
1382 self.dumpfile.write('\n\n')
1383 return change.closed_tags, change.closed_branches
1385 def delete_path(self, svn_path, tags, branches, prune=None):
1386 """If SVN_PATH exists in the head mirror, output the deletion to
1387 the dumpfile, else output nothing to the dumpfile.
1389 Return a tuple (path_deleted, closed_tags, closed_branches), where
1390 path_deleted is the path deleted if any or None if no deletion was
1391 necessary, and closed_tags and closed_names are lists of symbolic
1392 names closed off by this deletion -- that is, tags or branches
1393 which could be rooted in the previous revision of PATH, but not in
1394 this revision, because this rev changes PATH. If path_deleted is
1395 None, then closed_tags and closed_branches will both be empty.
1397 Iff PRUNE is true, then the path deleted can be not None, yet
1398 shorter than SVN_PATH because of pruning."""
1399 deleted_path, closed_tags, closed_branches \
1400 = self.repos_mirror.delete_path(svn_path, tags,
1401 branches, prune)
1402 if deleted_path:
1403 print " (deleted '%s')" % deleted_path
1404 self.dumpfile.write('Node-path: %s\n'
1405 'Node-action: delete\n'
1406 '\n' % self.utf8_path(deleted_path))
1407 return deleted_path, closed_tags, closed_branches
1409 def close(self):
1410 self.repos_mirror.close()
1412 # If we're only making a dumpfile, we should be done now. Just
1413 # close the dumpfile. Otherwise, we're in "incremental" mode, and
1414 # we need to close our incremental dumpfile, flush it to the
1415 # repository, and then remove it.
1416 if self.dump_only:
1417 self.dumpfile.close()
1418 else:
1419 self.flush_and_remove_dumpfile()
1420 ret = self.loader_pipe.close()
1421 if ret:
1422 sys.stderr.write('%s: svnadmin load exited with error code %s' %
1423 (error_prefix, ret))
1424 sys.exit(1)
1427 def format_date(date):
1428 """Return an svn-compatible date string for DATE (seconds since epoch)."""
1429 # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
1430 return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
1433 def make_revision_props(ctx, symbolic_name, is_tag, date=None):
1434 """Return a dictionary of revision properties for the manufactured
1435 commit that finished SYMBOLIC_NAME. If IS_TAG is true, write the
1436 log message as though for a tag, else as though for a branch.
1437 If DATE is passed, use it as the value of the svn:date property."""
1438 if is_tag:
1439 type = 'tag'
1440 else:
1441 type = 'branch'
1443 # In Python 2.2.3, we could use textwrap.fill(). Oh well :-).
1444 if len(symbolic_name) >= 13:
1445 space_or_newline = '\n'
1446 else:
1447 space_or_newline = ' '
1449 log = "This commit was manufactured by cvs2svn to create %s%s'%s'." \
1450 % (type, space_or_newline, symbolic_name)
1452 return { 'svn:author' : ctx.username,
1453 'svn:log' : log,
1454 'svn:date' : date or format_date(time.time())}
1457 class SymbolicNameTracker:
1458 """Track the Subversion path/revision ranges of CVS symbolic names.
1459 This is done in a .db file, representing a tree in the usual way.
1460 In addition to directory entries, each object in the database stores
1461 the earliest revision from which it could be copied, and the first
1462 revision from which it could no longer be copied. Intermediate
1463 directories go one step farther: they record counts for the various
1464 revisions from which items under them could have been copied, and
1465 counts for the cutoff revisions. For example:
1467 .----------.
1468 | sub1 | [(2, 1), (3, 3)]
1469 | / | [(5, 1), (17, 2), (50, 1)]
1470 | / |
1471 |/ sub2 |
1472 / \ |
1473 /|_____\____|
1474 / \
1475 ______/ \_________
1476 / \
1477 / \
1478 / \
1479 .---------. .---------.
1480 | file1 | | file3 |
1481 | / | [(3, 2)] | \ | [(2, 1), (3, 1)]
1482 | / | [(17, 1), (50, 1)] | \ | [(5, 1), (10, 1)]
1483 | / | | \ |
1484 |/ file2 | | file4 \|
1485 / \ | | / \
1486 /|_____\___| |___/_____|\
1487 / \ / \
1488 / \ / \
1489 / \ / \
1490 / + / +
1491 +======+ | +======+ |
1492 | | [(3, 1)] | | | [(2, 1)] |
1493 | | [(17, 1)] | | | [(5, 1)] |
1494 | | | | | |
1495 +======+ | +======+ |
1496 +======+ +======+
1497 | | [(3, 1)] | | [(3, 1)]
1498 | | [(50, 1)] | | [(17, 1)]
1499 | | | |
1500 +======+ +======+
1502 The two lists to the right of each node represent the 'opening' and
1503 'closing' revisions respectively. Each tuple in a list is of the
1504 form (REV, COUNT). For leaf nodes, COUNT is always 1, of course.
1505 For intermediate nodes, the counts are the sums of the corresponding
1506 counts of child nodes.
1508 These revision scores are used to determine the optimal copy
1509 revisions for each tree/subtree at branch or tag creation time.
1511 The svn path input will most often be a trunk path, because the
1512 path/rev information recorded here is about where and when the given
1513 symbolic name could be rooted, *not* a path/rev for which commits
1514 along that symbolic name take place (of course, commits only happen on
1515 branches anyway)."""
1517 def __init__(self):
1518 self.db_file = SYMBOLIC_NAMES_DB
1519 self.db = Database(self.db_file, 'n')
1520 self.root_key = gen_key()
1521 self.db[self.root_key] = {}
1523 # The keys for the opening and closing revision lists attached to
1524 # each directory or file. Includes "/" so as never to conflict
1525 # with any real entry.
1526 self.tags_opening_revs_key = "/tag-openings"
1527 self.tags_closing_revs_key = "/tag-closings"
1528 self.br_opening_revs_key = "/br-openings"
1529 self.br_closing_revs_key = "/br-closings"
1531 # When a node is copied into the repository, the revision copied
1532 # is stored under the appropriate key, and the corresponding
1533 # opening and closing rev lists are removed.
1534 self.tags_copyfrom_rev_key = "/tags-copyfrom-rev"
1535 self.br_copyfrom_rev_key = "/br-copyfrom-rev"
1537 def probe_path(self, symbolic_name, path, debugging=None):
1538 """If 'SYMBOLIC_NAME/PATH' exists in the symbolic name tree,
1539 return the value of its last component, else return None.
1540 PATH may be None, but may not start with '/'.
1541 If DEBUGGING is true, then print trace output to stdout."""
1542 if path:
1543 components = [symbolic_name] + string.split(path, '/')
1544 else:
1545 components = [symbolic_name]
1547 if debugging:
1548 print "PROBING SYMBOLIC NAME:\n", components
1550 parent_key = self.root_key
1551 parent = self.db[parent_key]
1552 last_component = "/"
1553 i = 1
1554 for component in components:
1555 if debugging:
1556 print " " * i,
1557 print "'%s' key: %s, val:" % (last_component, parent_key), parent
1559 # Check for a "can't happen."
1560 if not parent.has_key(component):
1561 sys.stderr.write("%s: sym probe failed: '%s' does not contain '%s'\n"
1562 % (error_prefix, last_component, component))
1563 sys.exit(1)
1565 this_entry_key = parent[component]
1566 this_entry_val = self.db[this_entry_key]
1567 parent_key = this_entry_key
1568 parent = this_entry_val
1569 last_component = component
1570 i = i + 1
1572 if debugging:
1573 print " " * i,
1574 print "parent_key: %s, val:" % parent_key, parent
1576 # It's not actually a parent at this point, it's the leaf node.
1577 return parent
1579 def bump_rev_count(self, item_key, rev, revlist_key):
1580 """Increment REV's count in opening or closing list under KEY.
1581 REVLIST_KEY is self.*_opening_revs_key or self.*_closing_revs_key,
1582 and indicates which rev list to increment REV's count in.
1584 For example, if REV is 7, REVLIST_KEY is
1585 self.tags_opening_revs_key, and the entry's tags opening revs list
1586 looks like this
1588 [(2, 5), (7, 2), (10, 15)]
1590 then afterwards it would look like this:
1592 [(2, 5), (7, 3), (10, 15)]
1594 But if no tuple for revision 7 were present, then one would be
1595 added, for example
1597 [(2, 5), (10, 15)]
1599 would become
1601 [(2, 5), (7, 1), (10, 15)]
1603 The list is sorted by ascending revision both before and after."""
1605 entry_val = self.db[item_key]
1607 if not entry_val.has_key(revlist_key):
1608 entry_val[revlist_key] = [(rev, 1)]
1609 else:
1610 rev_counts = entry_val[revlist_key]
1611 for i in range(len(rev_counts)):
1612 this_rev, this_count = rev_counts[i]
1613 if rev == this_rev:
1614 rev_counts[i] = (this_rev, this_count + 1)
1615 break
1616 elif this_rev > rev:
1617 if i > 0:
1618 i = i - 1
1619 rev_counts.insert(i, (rev, 1))
1620 break
1621 else:
1622 rev_counts.append((rev, 1))
1623 entry_val[revlist_key] = rev_counts
1625 self.db[item_key] = entry_val
1627 # The verb form of "root" is "root", but that would be misleading in
1628 # this case; and the opposite of "uproot" is presumably "downroot",
1629 # but that wouldn't exactly clarify either. Hence, "enroot" :-).
1630 def enroot_names(self, svn_path, svn_rev, names, opening_key):
1631 """Record SVN_PATH at SVN_REV as the earliest point from which the
1632 symbolic names in NAMES could be copied. OPENING_KEY is
1633 self.tags_opening_revs_key or self.br_opening_revs_key, to
1634 indicate whether NAMES contains tag names or branch names.
1635 SVN_PATH does not start with '/'."""
1637 # Guard against names == None
1638 if not names:
1639 return
1641 for name in names:
1642 components = [name] + string.split(svn_path, '/')
1643 parent_key = self.root_key
1644 for component in components:
1645 self.bump_rev_count(parent_key, svn_rev, opening_key)
1646 parent = self.db[parent_key]
1647 if not parent.has_key(component):
1648 new_child_key = gen_key()
1649 parent[component] = new_child_key
1650 self.db[new_child_key] = {}
1651 self.db[parent_key] = parent
1652 # One way or another, parent now has an entry for component.
1653 this_entry_key = parent[component]
1654 this_entry_val = self.db[this_entry_key]
1655 # Swaparoo.
1656 parent_key = this_entry_key
1657 parent = this_entry_val
1659 self.bump_rev_count(parent_key, svn_rev, opening_key)
1661 def enroot_tags(self, svn_path, svn_rev, tags):
1662 """Record SVN_PATH at SVN_REV as the earliest point from which the
1663 symbolic names in TAGS could be copied. SVN_PATH does not start
1664 with '/'."""
1665 self.enroot_names(svn_path, svn_rev, tags, self.tags_opening_revs_key)
1667 def enroot_branches(self, svn_path, svn_rev, branches):
1668 """Record SVN_PATH at SVN_REV as the earliest point from which the
1669 symbolic names in BRANCHES could be copied. SVN_PATH does not
1670 start with '/'."""
1671 self.enroot_names(svn_path, svn_rev, branches, self.br_opening_revs_key)
1673 def close_names(self, svn_path, svn_rev, names, closing_key):
1674 """Record that as of SVN_REV, SVN_PATH could no longer be the
1675 source from which any of symbolic names in NAMES could be copied.
1676 CLOSING_KEY is self.tags_closing_revs_key or
1677 self.br_closing_revs_key, to indicate whether NAMES are tags or
1678 branches. SVN_PATH does not start with '/'."""
1680 # Guard against names == None
1681 if not names:
1682 return
1684 for name in names:
1685 components = [name] + string.split(svn_path, '/')
1686 parent_key = self.root_key
1687 for component in components:
1688 self.bump_rev_count(parent_key, svn_rev, closing_key)
1689 parent = self.db[parent_key]
1690 # Check for a "can't happen".
1691 if not parent.has_key(component):
1692 sys.stderr.write("%s: in path '%s', value for parent key '%s' "
1693 "does not have entry '%s'\n"
1694 % (error_prefix, svn_path, parent_key, component))
1695 sys.exit(1)
1696 this_entry_key = parent[component]
1697 this_entry_val = self.db[this_entry_key]
1698 # Swaparoo.
1699 parent_key = this_entry_key
1700 parent = this_entry_val
1702 self.bump_rev_count(parent_key, svn_rev, closing_key)
1704 def close_tags(self, svn_path, svn_rev, tags):
1705 """Record that as of SVN_REV, SVN_PATH could no longer be the
1706 source from which any of TAGS could be copied. SVN_PATH does not
1707 start with '/'."""
1708 self.close_names(svn_path, svn_rev, tags, self.tags_closing_revs_key)
1710 def close_branches(self, svn_path, svn_rev, branches):
1711 """Record that as of SVN_REV, SVN_PATH could no longer be the
1712 source from which any of BRANCHES could be copied. SVN_PATH does
1713 not start with '/'."""
1714 self.close_names(svn_path, svn_rev, branches, self.br_closing_revs_key)
1716 def score_revisions(self, openings, closings):
1717 """Return a list of revisions and scores based on OPENINGS and
1718 CLOSINGS. The returned list looks like:
1720 [(REV1 SCORE1), (REV2 SCORE2), ...]
1722 where REV2 > REV1. OPENINGS and CLOSINGS are the values of
1723 self.tags_opening_revs_key and self.tags_closing_revs_key, or
1724 self.br_opening_revs_key and self.br_closing_revs_key, from some file or
1725 directory node, or else None.
1727 Each score indicates that copying the corresponding revision (or any
1728 following revision up to the next revision in the list) of
1729 the object in question would yield that many correct paths at or
1730 underneath the object. There may be other paths underneath it
1731 which are not correct and need to be deleted or recopied; those
1732 can only be detected by descending and examining their scores.
1734 If OPENINGS is false, return the empty list."""
1736 # First look for easy outs.
1737 if not openings:
1738 return []
1740 # Must be able to call len(closings) below.
1741 if closings is None:
1742 closings = []
1744 # No easy out, so wish for lexical closures and calculate the scores :-).
1745 scores = []
1746 opening_score_accum = 0
1747 for i in range(len(openings)):
1748 opening_rev, opening_score = openings[i]
1749 opening_score_accum = opening_score_accum + opening_score
1750 scores.append((opening_rev, opening_score_accum))
1751 min = 0
1752 for i in range(len(closings)):
1753 closing_rev, closing_score = closings[i]
1754 done_exact_rev = None
1755 insert_index = None
1756 insert_score = None
1757 for j in range(min, len(scores)):
1758 score_rev, score = scores[j]
1759 if score_rev >= closing_rev:
1760 if not done_exact_rev:
1761 if score_rev > closing_rev:
1762 insert_index = j
1763 insert_score = scores[j-1][1] - closing_score
1764 done_exact_rev = 1
1765 scores[j] = (score_rev, score - closing_score)
1766 else:
1767 min = j + 1
1768 if not done_exact_rev:
1769 scores.append((closing_rev,scores[-1][1] - closing_score))
1770 if insert_index is not None:
1771 scores.insert(insert_index, (closing_rev, insert_score))
1772 return scores
1774 def best_rev(self, scores, prefer_rev, limit_rev):
1775 """Return the revision older than LIMIT_REV with the highest score
1776 from SCORES, a list returned by score_revisions(). When the maximum score
1777 is shared by multiple revisions, the oldest revision is selected, unless
1778 PREFER_REV is one of the possibilities, in which case, it is selected."""
1779 max_score = 0
1780 prefer_rev_score = -1
1781 rev = SVN_INVALID_REVNUM
1782 for pair in scores:
1783 if pair[1] > max_score and pair[0] < limit_rev:
1784 max_score = pair[1]
1785 rev = pair[0]
1786 if pair[0] <= prefer_rev:
1787 prefer_rev_score = pair[1]
1788 if prefer_rev_score == max_score:
1789 rev = prefer_rev
1790 return rev
1792 def is_best_rev(self, scores, rev, limit_rev):
1793 """Return true if REV has the highest score for revisions older than
1794 LIMIT_REV from SCORES, a list returned by score_revisions()."""
1795 return self.best_rev(scores, rev, limit_rev) == rev
1797 # Helper for copy_descend().
1798 def cleanup_entries(self, rev, limit_rev, entries, is_tag):
1799 """Return a copy of ENTRIES, minus the individual entries whose
1800 highest scoring revision doesn't match REV (and also, minus and
1801 special '/'-denoted flags). IS_TAG is 1 or None, based on whether
1802 this work is being done for the sake of a tag or a branch."""
1803 if is_tag:
1804 opening_key = self.tags_opening_revs_key
1805 closing_key = self.tags_closing_revs_key
1806 else:
1807 opening_key = self.br_opening_revs_key
1808 closing_key = self.br_closing_revs_key
1810 new_entries = {}
1811 for key in entries.keys():
1812 if key[0] == '/': # Skip flags
1813 continue
1814 entry = entries.get(key)
1815 val = self.db[entry]
1816 scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1817 if self.is_best_rev(scores, rev, limit_rev):
1818 new_entries[key] = entry
1819 return new_entries
1821 # Helper for fill_branch().
1822 def copy_descend(self, dumper, ctx, name, parent, entry_name,
1823 parent_rev, src_path, dst_path, is_tag, jit_new_rev=None):
1824 """Starting with ENTRY_NAME in directory object PARENT at
1825 PARENT_REV, use DUMPER and CTX to copy nodes in the Subversion
1826 repository, manufacturing the source paths with SRC_PATH and the
1827 destination paths with NAME and DST_PATH.
1829 If IS_TAG is true, NAME is treated as a tag, else as a branch.
1831 If JIT_NEW_REV is not None, it is a list of one or two elements.
1832 If the first element is true, then if any copies are to be made,
1833 invoke DUMPER.start_revision() before the first copy, then set
1834 JIT_NEW_REV[0] to None, so no more new revisions are made for this
1835 symbolic name anywhere in this descent.
1837 The second element, if present, is the string to be used for the svn:date
1838 property of any JIT-created revision.
1840 ('JIT' == 'Just In Time'.)"""
1841 ### Hmmm, is passing [1] instead of 1 an idiomatic way of passing
1842 ### a side-effectable boolean in Python? That's how the
1843 ### JIT_NEW_REV parameter works here and elsewhere, but maybe
1844 ### there's a clearer way to do it?
1846 key = parent[entry_name]
1847 val = self.db[key]
1849 if is_tag:
1850 opening_key = self.tags_opening_revs_key
1851 closing_key = self.tags_closing_revs_key
1852 copyfrom_rev_key = self.tags_copyfrom_rev_key
1853 else:
1854 opening_key = self.br_opening_revs_key
1855 closing_key = self.br_closing_revs_key
1856 copyfrom_rev_key = self.br_copyfrom_rev_key
1858 limit_rev = dumper.revision
1859 if jit_new_rev and jit_new_rev[0]:
1860 # Because in this case the current rev is complete,
1861 # so is a valid copyfrom source
1862 limit_rev = limit_rev + 1
1864 if not val.has_key(copyfrom_rev_key):
1865 # If not already copied this subdir, calculate its "best rev"
1866 # and see if it differs from parent's best rev.
1867 scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1868 rev = self.best_rev(scores, parent_rev, limit_rev)
1870 if rev == SVN_INVALID_REVNUM:
1871 return # name is a branch, but we're doing a tag, or vice versa
1873 else:
1874 if is_tag:
1875 copy_dst = make_path(ctx, dst_path, None, name)
1876 else:
1877 copy_dst = make_path(ctx, dst_path, name, None)
1879 expected_entries = self.cleanup_entries(rev, limit_rev,
1880 val, is_tag)
1881 if (rev != parent_rev):
1882 if jit_new_rev and jit_new_rev[0]:
1883 dumper.start_revision(make_revision_props(ctx, name, is_tag,
1884 len(jit_new_rev) > 1 and jit_new_rev[1] or None))
1885 jit_new_rev[0] = None
1886 if dumper.copy_path(src_path, rev, copy_dst, expected_entries):
1887 parent_rev = rev
1888 else:
1889 # If we didn't copy, then we need to prune
1890 dumper.prune_entries(copy_dst, expected_entries)
1891 else:
1892 # Even if we kept the already-present revision of this entry
1893 # instead of copying a new one, we still need to prune out
1894 # anything that's not part of the symbolic name.
1895 dumper.prune_entries(copy_dst, expected_entries)
1897 # Record that this copy is done:
1898 val[copyfrom_rev_key] = parent_rev
1899 if val.has_key(opening_key):
1900 del val[opening_key]
1901 if val.has_key(closing_key):
1902 del val[closing_key]
1903 self.db[key] = val
1905 for ent in val.keys():
1906 if not ent[0] == '/':
1907 if src_path:
1908 next_src = src_path + '/' + ent
1909 else:
1910 next_src = ent
1911 if dst_path:
1912 next_dst = dst_path + '/' + ent
1913 else:
1914 next_dst = ent
1915 self.copy_descend(dumper, ctx, name, val, ent, parent_rev,
1916 next_src, next_dst, is_tag, jit_new_rev)
1918 def fill_name(self, dumper, ctx, name, is_tag, jit_new_rev=None):
1919 """Use DUMPER to create all currently available parts of symbolic
1920 name NAME that have not been created already.
1922 If IS_TAG is true, NAME is treated as a tag, else as a branch.
1924 JIT_NEW_REV is as documented for the copy_descend() function."""
1926 # A source path looks like this in the symbolic name tree:
1928 # thisbranch/trunk/proj/foo/bar/baz.c
1930 # ...or occasionally...
1932 # thisbranch/branches/sourcebranch/proj/foo/bar/baz.c
1934 # (the latter when 'thisbranch' is branched off 'sourcebranch').
1936 # Meanwhile, we're copying to a location in the repository like
1938 # /branches/thisbranch/proj/foo/bar/baz.c or
1939 # /tags/tagname/proj/foo/bar/baz.c
1941 # Of course all this depends on make_path()'s behavior. At
1942 # various times we've changed the way it produces paths (see
1943 # revisions 6028 and 6347). If it changes again, the logic here
1944 # must be adjusted to match.
1946 parent_key = self.root_key
1947 parent = self.db[parent_key]
1949 # If there are no origin records, then we must've messed up earlier.
1950 if not parent.has_key(name):
1951 if is_tag:
1952 sys.stderr.write("%s: no origin records for tag '%s'.\n"
1953 % (error_prefix, name))
1954 else:
1955 sys.stderr.write("%s: no origin records for branch '%s'.\n"
1956 % (error_prefix, name))
1957 sys.exit(1)
1959 parent_key = parent[name]
1960 parent = self.db[parent_key]
1962 # All Subversion source paths under the branch start with one of
1963 # three things:
1965 # /trunk/...
1966 # /branches/foo/...
1967 # /tags/foo/...
1969 # (We don't care what foo is, it's just a component to skip over.)
1971 # Since these don't all have the same number of components, we
1972 # manually descend into each as far as necessary, then invoke
1973 # copy_descend() once we're in the right place in both trees.
1975 # Since it's possible for a branch or tag to have some source
1976 # paths on trunk and some on branches, there's some question about
1977 # what to copy as the top-level directory of the branch. Our
1978 # solution is to [somewhat randomly] give preference to trunk.
1979 # Note that none of these paths can ever conflict; for example,
1980 # it would be impossible to have both
1982 # thisbranch/trunk/myproj/lib/drivers.c and
1983 # thisbranch/branches/sourcebranch/myproj/lib/drivers.c
1985 # because that would imply that the symbolic name 'thisbranch'
1986 # appeared twice in the RCS file header, referring to two
1987 # different revisions. Well, I suppose that's *possible*, but its
1988 # effect is undefined, and it's as reasonable for us to just
1989 # overwrite one with the other as anything else -- anyway, isn't
1990 # that what CVS would do if you checked out the branch? <shrug>
1992 if parent.has_key(ctx.trunk_base):
1993 self.copy_descend(dumper, ctx, name, parent, ctx.trunk_base,
1994 SVN_INVALID_REVNUM, ctx.trunk_base, "",
1995 is_tag, jit_new_rev)
1996 if parent.has_key(ctx.branches_base):
1997 branch_base_key = parent[ctx.branches_base]
1998 branch_base = self.db[branch_base_key]
1999 for this_source in branch_base.keys():
2000 # We skip special names beginning with '/' for the usual
2001 # reason. We skip cases where (this_source == name) for a
2002 # different reason: if a CVS branch were rooted in itself,
2003 # that would imply that the same symbolic name appeared on two
2004 # different branches in an RCS file, which CVS doesn't
2005 # permit. So while it wouldn't hurt to descend, it would be a
2006 # waste of time.
2007 if (this_source[0] != '/') and (this_source != name):
2008 src_path = ctx.branches_base + '/' + this_source
2009 self.copy_descend(dumper, ctx, name, branch_base, this_source,
2010 SVN_INVALID_REVNUM, src_path, "",
2011 is_tag, jit_new_rev)
2013 def fill_tag(self, dumper, ctx, tag, jit_new_rev=None):
2014 """Use DUMPER to create all currently available parts of TAG that
2015 have not been created already. Use CTX.trunk_base, CTX.tags_base,
2016 and CTX.branches_base to determine the source and destination
2017 paths in the Subversion repository.
2019 JIT_NEW_REV is as documented for the copy_descend() function."""
2020 self.fill_name(dumper, ctx, tag, 1, jit_new_rev)
2022 def fill_branch(self, dumper, ctx, branch, jit_new_rev=None):
2023 """Use DUMPER to create all currently available parts of BRANCH that
2024 haven't been created already. Use CTX.trunk_base, CTX.tags_base,
2025 and CTX.branches_base to determine the source and destination
2026 paths in the Subversion repository.
2028 JIT_NEW_REV is as documented for the copy_descend() function."""
2029 self.fill_name(dumper, ctx, branch, None, jit_new_rev)
2031 def finish(self, dumper, ctx):
2032 """Use DUMPER to finish branches and tags that have either
2033 not been created yet, or have been only partially created.
2034 Use CTX.trunk_base, CTX.tags_base, and CTX.branches_base to
2035 determine the source and destination paths in the Subversion
2036 repository."""
2037 parent_key = self.root_key
2038 parent = self.db[parent_key]
2039 # Do all branches first, then all tags. We don't bother to check
2040 # here whether a given name is a branch or a tag, or is done
2041 # already; the fill_foo() methods will just do nothing if there's
2042 # nothing to do.
2044 # We do one revision per branch or tag, for clarity to users, not
2045 # for correctness. In CVS, when you make a branch off a branch,
2046 # the new branch will just root itself in the roots of the old
2047 # branch *except* where the new branch sprouts from a revision
2048 # that was actually committed on the old branch. In the former
2049 # cases, the source paths will be the same as the source paths
2050 # from which the old branch was created and therefore will already
2051 # exist; and in the latter case, the source paths will actually be
2052 # on the old branch, but those paths will exist already because
2053 # they were commits on that branch and therefore cvs2svn must have
2054 # created it already (see the fill_branch call in Commit.commit).
2055 # So either way, the source paths exist by the time we need them.
2057 ### It wouldn't be so awfully hard to determine whether a name is
2058 ### just a branch or just a tag, which would allow for more
2059 ### intuitive messages below.
2060 if not ctx.trunk_only:
2061 print "Finishing branches:"
2062 for name in parent.keys():
2063 if name[0] != '/':
2064 print "finishing '%s' as branch" % name
2065 self.fill_branch(dumper, ctx, name, [1])
2066 print "Finishing tags:"
2067 for name in parent.keys():
2068 if name[0] != '/':
2069 print "finishing '%s' as tag" % name
2070 self.fill_tag(dumper, ctx, name, [1])
2073 def is_trunk_vendor_revision(default_branches_db, cvs_path, cvs_rev):
2074 """Return 1 if CVS_REV of CVS_PATH is a trunk (i.e., head) vendor
2075 revision according to DEFAULT_BRANCHES_DB, else return None."""
2076 if default_branches_db.has_key(cvs_path):
2077 val = default_branches_db[cvs_path]
2078 val_last_dot = val.rindex(".")
2079 received_last_dot = cvs_rev.rindex(".")
2080 default_branch = val[:val_last_dot]
2081 received_branch = cvs_rev[:received_last_dot]
2082 default_rev_component = int(val[val_last_dot + 1:])
2083 received_rev_component = int(cvs_rev[received_last_dot + 1:])
2084 if (default_branch == received_branch
2085 and received_rev_component <= default_rev_component):
2086 return 1
2087 # else
2088 return None
2091 class Commit:
2092 def __init__(self, author, log):
2093 self.author = author
2094 self.log = log
2096 self.files = { }
2098 # For consistency, the elements of both lists are of the form
2100 # (file, rev, deltatext_code, branch_name, tags, branches)
2102 # even though self.deletes doesn't use the deltatext_code.
2103 self.changes = [ ]
2104 self.deletes = [ ]
2106 # Start out with a t_min higher than any incoming time T, and a
2107 # t_max lower than any incoming T. This way the first T will
2108 # push t_min down to T, and t_max up to T, naturally (without any
2109 # special-casing), and successive times will then ratchet them
2110 # outward as appropriate.
2111 self.t_min = 1L<<32
2112 self.t_max = 0
2114 def has_file(self, fname):
2115 return self.files.has_key(fname)
2117 def add(self, t, op, file, rev, deltatext_code, branch_name, tags, branches):
2118 # Record the time range of this commit.
2120 # ### ISSUE: It's possible, though unlikely, that the time range
2121 # of a commit could get gradually expanded to be arbitrarily
2122 # longer than COMMIT_THRESHOLD. I'm not sure this is a huge
2123 # problem, and anyway deciding where to break it up would be a
2124 # judgement call. For now, we just print a warning in commit() if
2125 # this happens.
2126 if t < self.t_min:
2127 self.t_min = t
2128 if t > self.t_max:
2129 self.t_max = t
2131 if op == OP_CHANGE:
2132 self.changes.append((file, rev, deltatext_code, branch_name,
2133 tags, branches))
2134 else:
2135 # OP_DELETE
2136 self.deletes.append((file, rev, deltatext_code, branch_name,
2137 tags, branches))
2138 self.files[file] = 1
2140 def commit(self, dumper, ctx, sym_tracker):
2141 # commit this transaction
2142 seconds = self.t_max - self.t_min
2143 print 'committing: %s, over %d seconds' % (time.ctime(self.t_min), seconds)
2144 if seconds > COMMIT_THRESHOLD:
2145 print '%s: commit spans more than %d seconds' \
2146 % (warning_prefix, COMMIT_THRESHOLD)
2148 if ctx.dry_run:
2149 for f, r, dt_code, br, tags, branches in self.changes:
2150 # compute a repository path, dropping the ,v from the file name
2151 svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
2152 print " adding or changing '%s' : '%s'" % (r, svn_path)
2153 for f, r, dt_code, br, tags, branches in self.deletes:
2154 # compute a repository path, dropping the ,v from the file name
2155 svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
2156 print " deleting '%s' : '%s'" % (r, svn_path)
2157 print ' (skipped; dry run enabled)'
2158 return
2160 do_copies = [ ]
2162 # State for handling default branches.
2164 # Here is a tempting, but ultimately nugatory, bit of logic, which
2165 # I share with you so you may appreciate the less attractive, but
2166 # refreshingly non-nugatory, logic which follows it:
2168 # If some of the commits in this txn happened on a non-trunk
2169 # default branch, then those files will have to be copied into
2170 # trunk manually after being changed on the branch (because the
2171 # RCS "default branch" appears as head, i.e., trunk, in practice).
2172 # As long as those copies don't overwrite any trunk paths that
2173 # were also changed in this commit, then we can do the copies in
2174 # the same revision, because they won't cover changes that don't
2175 # appear anywhere/anywhen else. However, if some of the trunk dst
2176 # paths *did* change in this commit, then immediately copying the
2177 # branch changes would lose those trunk mods forever. So in this
2178 # case, we need to do at least that copy in its own revision. And
2179 # for simplicity's sake, if we're creating the new revision for
2180 # even one file, then we just do all such copies together in the
2181 # new revision.
2183 # Doesn't that sound nice?
2185 # Unfortunately, Subversion doesn't support copies with sources
2186 # in the current txn. All copies must be based in committed
2187 # revisions. Therefore, we generate the above-described new
2188 # revision unconditionally.
2190 # Each of these is a list of tuples. Each tuple is of the form:
2192 # (cvs_path, branch_name, tags_rooted_here, branches_rooted_here)
2194 # and a tuple is created for each default branch commit that will
2195 # need to be copied to trunk (or deleted from trunk) in the
2196 # generated revision following the "regular" revision.
2197 default_branch_copies = [ ]
2198 default_branch_deletes = [ ]
2200 # we already have the date, so just format it
2201 date = format_date(self.t_max)
2202 try:
2203 ### FIXME: The 'replace' behavior should be an option, like
2204 ### --encoding is.
2205 unicode_author = unicode(self.author, ctx.encoding, 'replace')
2206 unicode_log = unicode(self.log, ctx.encoding, 'replace')
2207 props = { 'svn:author' : unicode_author.encode('utf8'),
2208 'svn:log' : unicode_log.encode('utf8'),
2209 'svn:date' : date }
2210 except UnicodeError:
2211 print '%s: problem encoding author or log message:' % warning_prefix
2212 print " author: '%s'" % self.author
2213 print " log: '%s'" % self.log
2214 print " date: '%s'" % date
2215 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
2216 print " rev %s of '%s'" % (cvs_rev, rcs_file)
2217 print "Consider rerunning with (for example) '--encoding=latin1'."
2218 # Just fall back to the original data.
2219 props = { 'svn:author' : self.author,
2220 'svn:log' : self.log,
2221 'svn:date' : date }
2224 # Tells whether we actually wrote anything to the dumpfile.
2225 svn_rev = SVN_INVALID_REVNUM
2227 # If any of the changes we are about to do are on branches, we need to
2228 # check and maybe fill them (in their own revisions) *before* we start
2229 # then data revision. So we have to iterate over changes and deletes twice.
2230 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
2231 # compute a repository path, dropping the ,v from the file name
2232 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2233 svn_path = make_path(ctx, cvs_path, br)
2234 if br:
2235 ### FIXME: Here is an obvious optimization point. Probably
2236 ### dump.probe_path(PATH) is kind of slow, because it does N
2237 ### database lookups for the N components in PATH. If this
2238 ### turns out to be a performance bottleneck, we can just
2239 ### maintain a database mirroring just the head tree, but
2240 ### keyed on full paths, to reduce the check to a quick
2241 ### constant time query.
2242 if not dumper.probe_path(svn_path):
2243 sym_tracker.fill_branch(dumper, ctx, br, [1, date])
2245 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.deletes:
2246 # compute a repository path, dropping the ,v from the file name
2247 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2248 svn_path = make_path(ctx, cvs_path, br)
2249 if br:
2250 ### FIXME: Here is an obvious optimization point. Probably
2251 ### dump.probe_path(PATH) is kind of slow, because it does N
2252 ### database lookups for the N components in PATH. If this
2253 ### turns out to be a performance bottleneck, we can just
2254 ### maintain a database mirroring just the head tree, but
2255 ### keyed on full paths, to reduce the check to a quick
2256 ### constant time query.
2257 if not dumper.probe_path(svn_path):
2258 sym_tracker.fill_branch(dumper, ctx, br, [1, date])
2261 # Now that any branches we need exist, we can do the commits.
2262 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
2263 # compute a repository path, dropping the ,v from the file name
2264 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2265 svn_path = make_path(ctx, cvs_path, br)
2266 if svn_rev == SVN_INVALID_REVNUM:
2267 svn_rev = dumper.start_revision(props)
2268 sym_tracker.enroot_tags(svn_path, svn_rev, tags)
2269 sym_tracker.enroot_branches(svn_path, svn_rev, branches)
2270 print " adding or changing %s : '%s'" % (cvs_rev, svn_path)
2272 # Only make a change if we need to. When 1.1.1.1 has an empty
2273 # deltatext, the explanation is almost always that we're looking
2274 # at an imported file whose 1.1 and 1.1.1.1 are identical. On
2275 # such imports, CVS creates an RCS file where 1.1 has the
2276 # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2277 # content as 1.1. There's no reason to reflect this non-change
2278 # in the repository, so we want to do nothing in this case. (If
2279 # we were really paranoid, we could make sure 1.1's log message
2280 # is the CVS-generated "Initial revision\n", but I think the
2281 # conditions below are strict enough.)
2282 if not ((dt_code == DELTATEXT_EMPTY) and (cvs_rev == "1.1.1.1")
2283 and dumper.probe_path(svn_path)):
2284 closed_tags, closed_branches = \
2285 dumper.add_or_change_path(cvs_path,
2286 svn_path,
2287 cvs_rev,
2288 rcs_file,
2289 tags,
2290 branches,
2291 ctx.cvs_revnums)
2292 if is_trunk_vendor_revision(ctx.default_branches_db,
2293 cvs_path, cvs_rev):
2294 default_branch_copies.append((cvs_path, br, tags, branches))
2295 sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
2296 sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
2298 for rcs_file, cvs_rev, dt_code, br, tags, branches in self.deletes:
2299 # compute a repository path, dropping the ,v from the file name
2300 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
2301 svn_path = make_path(ctx, cvs_path, br)
2302 print " deleting %s : '%s'" % (cvs_rev, svn_path)
2303 if svn_rev == SVN_INVALID_REVNUM:
2304 svn_rev = dumper.start_revision(props)
2305 # Uh, can this even happen on a deleted path? Hmmm. If not,
2306 # there's no risk, since tags and branches would just be empty
2307 # and therefore enrooting would be a no-op. Still, it would
2308 # be clearer to know for sure and simply not call it.
2309 sym_tracker.enroot_tags(svn_path, svn_rev, tags)
2310 sym_tracker.enroot_branches(svn_path, svn_rev, branches)
2311 ### FIXME: this will return path_deleted == None if no path
2312 ### was deleted. But we'll already have started the revision
2313 ### by then, so it's a bit late to use the knowledge! Need to
2314 ### reorganize things so that starting the revision is a
2315 ### callback with its own internal conditional, so anyone can
2316 ### just invoke when they know they're really about to do
2317 ### something.
2319 ### Right now what happens is we get an empty revision
2320 ### (assuming nothing else happened in this revision).
2321 path_deleted, closed_tags, closed_branches = \
2322 dumper.delete_path(svn_path, tags, branches, ctx.prune)
2323 if is_trunk_vendor_revision(ctx.default_branches_db, cvs_path, cvs_rev):
2324 default_branch_deletes.append((cvs_path, br, tags, branches))
2325 sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
2326 sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
2328 if svn_rev == SVN_INVALID_REVNUM:
2329 print ' no new revision created, as nothing to do'
2330 else:
2331 print ' new revision:', svn_rev
2332 if default_branch_copies or default_branch_deletes:
2333 previous_rev = svn_rev
2334 msg = 'This commit was generated by cvs2svn to compensate for ' \
2335 'changes in r%d,\n' \
2336 'which included commits to RCS files with non-trunk default ' \
2337 'branches.\n' % previous_rev
2338 props = { 'svn:author' : 'cvs2svn',
2339 'svn:log' : msg,
2340 'svn:date' : date }
2341 svn_rev = dumper.start_revision(props)
2343 for cvs_path, br, tags, branches in default_branch_copies:
2344 src_path = make_path(ctx, cvs_path, br)
2345 dst_path = make_path(ctx, cvs_path)
2346 if (dumper.probe_path(dst_path)):
2347 ign, closed_tags, closed_branches = \
2348 dumper.delete_path(dst_path, tags, branches, ctx.prune)
2349 sym_tracker.close_tags(dst_path, svn_rev, closed_tags)
2350 sym_tracker.close_branches(dst_path, svn_rev, closed_branches)
2351 dumper.copy_path(src_path, previous_rev, dst_path)
2353 for cvs_path, br, tags, branches in default_branch_deletes:
2354 # Ignore the branch -- we don't need to know the default
2355 # branch, we already know we're deleting this from trunk.
2356 dst_path = make_path(ctx, cvs_path)
2357 if (dumper.probe_path(dst_path)):
2358 ign, closed_tags, closed_branches = \
2359 dumper.delete_path(dst_path, tags, branches, ctx.prune)
2360 sym_tracker.close_tags(dst_path, svn_rev, closed_tags)
2361 sym_tracker.close_branches(dst_path, svn_rev, closed_branches)
2364 def read_resync(fname):
2365 "Read the .resync file into memory."
2367 ### note that we assume that we can hold the entire resync file in
2368 ### memory. really large repositories with whacky timestamps could
2369 ### bust this assumption. should that ever happen, then it is possible
2370 ### to split the resync file into pieces and make multiple passes,
2371 ### using each piece.
2374 # A digest maps to a sequence of lists which specify a lower and upper
2375 # time bound for matching up the commit. We keep a sequence of these
2376 # because a number of checkins with the same log message (e.g. an empty
2377 # log message) could need to be remapped. We also make them a list because
2378 # we will dynamically expand the lower/upper bound as we find commits
2379 # that fall into a particular msg and time range.
2381 # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
2383 resync = { }
2385 for line in fileinput.FileInput(fname):
2386 t1 = int(line[:8], 16)
2387 digest = line[9:DIGEST_END_IDX]
2388 t2 = int(line[DIGEST_END_IDX+1:], 16)
2389 t1_l = t1 - COMMIT_THRESHOLD/2
2390 t1_u = t1 + COMMIT_THRESHOLD/2
2391 if resync.has_key(digest):
2392 resync[digest].append([t1_l, t1_u, t2])
2393 else:
2394 resync[digest] = [ [t1_l, t1_u, t2] ]
2396 # For each digest, sort the resync items in it in increasing order,
2397 # based on the lower time bound.
2398 digests = resync.keys()
2399 for digest in digests:
2400 (resync[digest]).sort()
2402 return resync
2405 def parse_revs_line(line):
2406 data = line.split(' ', 7)
2407 timestamp = int(data[0], 16)
2408 id = data[1]
2409 op = data[2]
2410 rev = data[3]
2411 deltatext_code = data[4]
2412 branch_name = data[5]
2413 if branch_name == "*":
2414 branch_name = None
2415 ntags = int(data[6])
2416 tags = data[7].split(' ', ntags + 1)
2417 nbranches = int(tags[ntags])
2418 branches = tags[ntags + 1].split(' ', nbranches)
2419 fname = branches[nbranches][:-1] # strip \n
2420 tags = tags[:ntags]
2421 branches = branches[:nbranches]
2423 return timestamp, id, op, rev, deltatext_code, \
2424 fname, branch_name, tags, branches
2427 def write_revs_line(output, timestamp, digest, op, revision,
2428 deltatext_code, fname, branch_name, tags, branches):
2429 output.write('%08lx %s %s %s %s ' % \
2430 (timestamp, digest, op, revision, deltatext_code))
2431 if not branch_name:
2432 branch_name = "*"
2433 output.write('%s ' % branch_name)
2434 output.write('%d ' % (len(tags)))
2435 for tag in tags:
2436 output.write('%s ' % (tag))
2437 output.write('%d ' % (len(branches)))
2438 for branch in branches:
2439 output.write('%s ' % (branch))
2440 output.write('%s\n' % fname)
2443 def pass1(ctx):
2444 cd = CollectData(ctx.cvsroot, DATAFILE, ctx.default_branches_db)
2445 p = rcsparse.Parser()
2446 stats = [ 0 ]
2447 os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
2448 if ctx.verbose:
2449 print 'processed', stats[0], 'files'
2450 if len(cd.fatal_errors) > 0:
2451 sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
2452 + "Error summary:\n"
2453 + "\n".join(cd.fatal_errors)
2454 + "\nExited due to fatal error(s).")
2456 def pass2(ctx):
2457 "Pass 2: clean up the revision information."
2459 # We may have recorded some changes in revisions' timestamp. We need to
2460 # scan for any other files which may have had the same log message and
2461 # occurred at "the same time" and change their timestamps, too.
2463 # read the resync data file
2464 resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)
2466 output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')
2468 # process the revisions file, looking for items to clean up
2469 for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
2470 timestamp, digest, op, rev, deltatext_code, fname, \
2471 branch_name, tags, branches = parse_revs_line(line)
2472 if not resync.has_key(digest):
2473 output.write(line)
2474 continue
2476 # we have a hit. see if this is "near" any of the resync records we
2477 # have recorded for this digest [of the log message].
2478 for record in resync[digest]:
2479 if record[0] <= timestamp <= record[1]:
2480 # bingo! remap the time on this (record[2] is the new time).
2481 write_revs_line(output, record[2], digest, op, rev,
2482 deltatext_code, fname, branch_name, tags, branches)
2484 print "RESYNC: '%s' (%s) : old time='%s' new time='%s'" \
2485 % (relative_name(ctx.cvsroot, fname),
2486 rev, time.ctime(timestamp), time.ctime(record[2]))
2488 # adjust the time range. we want the COMMIT_THRESHOLD from the
2489 # bounds of the earlier/latest commit in this group.
2490 record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
2491 record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)
2493 # stop looking for hits
2494 break
2495 else:
2496 # the file/rev did not need to have its time changed.
2497 output.write(line)
2500 def pass3(ctx):
2501 # sort the log files
2503 # GNU sort will sort our dates differently (incorrectly!) if our
2504 # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
2505 # it to 'C'
2506 lc_all_tmp = os.getenv('LC_ALL')
2507 os.putenv('LC_ALL', 'C')
2508 run_command('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
2509 ctx.log_fname_base + SORTED_REVS_SUFFIX))
2510 if lc_all_tmp is not None:
2511 os.putenv('LC_ALL', lc_all_tmp)
2512 else:
2513 os.unsetenv('LC_ALL')
2516 def pass4(ctx):
2517 sym_tracker = SymbolicNameTracker()
2518 metadata_db = Database(METADATA_DB, 'r')
2520 # A dictionary of Commit objects, keyed by digest. Each object
2521 # represents one logical commit, which may involve multiple files.
2523 # The reason this is a dictionary, not a single object, is that
2524 # there may be multiple commits interleaved in time. A commit can
2525 # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
2526 # for parts of some other commit to occur. Since the s-revs file is
2527 # sorted by timestamp first, then by digest within each timestamp,
2528 # it's quite easy to have interleaved commits.
2529 commits = { }
2531 # The total number of separate commits processed. This is used only for
2532 # printing statistics, it does not affect the results in the repository.
2533 count = 0
2535 # Start the dumpfile object.
2536 dumper = Dumper(ctx)
2538 # process the logfiles, creating the target
2539 for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
2540 timestamp, id, op, rev, deltatext_code, fname, \
2541 branch_name, tags, branches = parse_revs_line(line)
2543 if ctx.trunk_only and not trunk_rev.match(rev):
2544 ### note this could/should have caused a flush, but the next item
2545 ### will take care of that for us
2546 continue
2548 # Each time we read a new line, we scan the commits we've
2549 # accumulated so far to see if any are ready for processing now.
2550 process = [ ]
2551 for scan_id, scan_c in commits.items():
2552 if scan_c.t_max + COMMIT_THRESHOLD < timestamp:
2553 process.append((scan_c.t_max, scan_c))
2554 del commits[scan_id]
2555 continue
2556 # If the inbound commit is on the same file as a pending commit,
2557 # close the pending commit to further changes. Don't flush it though,
2558 # as there may be other pending commits dated before this one.
2559 # ### ISSUE: the has_file() check below is not optimal.
2560 # It does fix the dataloss bug where revisions would get lost
2561 # if checked in too quickly, but it can also break apart the
2562 # commits. The correct fix would require tracking the dependencies
2563 # between change sets and committing them in proper order.
2564 if scan_c.has_file(fname):
2565 unused_id = scan_id + '-'
2566 while commits.has_key(unused_id):
2567 unused_id = unused_id + '-'
2568 commits[unused_id] = scan_c
2569 del commits[scan_id]
2571 # If there are any elements in 'process' at this point, they need
2572 # to be committed, because this latest rev couldn't possibly be
2573 # part of any of them. Sort them into time-order, then commit 'em.
2574 process.sort()
2575 for t_max, c in process:
2576 c.commit(dumper, ctx, sym_tracker)
2577 count = count + len(process)
2579 # Add this item into the set of still-available commits.
2580 if commits.has_key(id):
2581 c = commits[id]
2582 else:
2583 author, log = metadata_db[id]
2584 c = commits[id] = Commit(author, log)
2585 c.add(timestamp, op, fname, rev, deltatext_code, branch_name,
2586 tags, branches)
2588 # End of the sorted revs file. Flush any remaining commits:
2589 if commits:
2590 process = [ ]
2591 for id, c in commits.items():
2592 process.append((c.t_max, c))
2593 process.sort()
2594 for t_max, c in process:
2595 c.commit(dumper, ctx, sym_tracker)
2596 count = count + len(process)
2598 # Create (or complete) any branches and tags not already done.
2599 sym_tracker.finish(dumper, ctx)
2601 dumper.close()
2603 if ctx.verbose:
2604 print count, 'commits processed.'
2607 def pass5(ctx):
2608 if ctx.skip_cleanup:
2609 return
2611 # Remove our database files
2612 os.unlink(SVN_REVISIONS_DB)
2613 os.unlink(NODES_DB)
2614 os.unlink(SYMBOLIC_NAME_ROOTS_DB)
2615 os.unlink(SYMBOLIC_NAMES_DB)
2616 os.unlink(METADATA_DB)
2618 # This is the only DB reference still reachable at this point; lose
2619 # it before removing the file.
2620 ctx.default_branches_db = None
2621 os.unlink(DEFAULT_BRANCHES_DB)
2623 # Remove our other data files
2624 for suffix in (REVS_SUFFIX, CLEAN_REVS_SUFFIX,
2625 SORTED_REVS_SUFFIX, RESYNC_SUFFIX):
2626 os.unlink('cvs2svn-data' + suffix)
2629 _passes = [
2630 pass1,
2631 pass2,
2632 pass3,
2633 pass4,
2634 pass5,
2638 class _ctx:
2639 pass
2642 def convert(ctx, start_pass=1):
2643 "Convert a CVS repository to an SVN repository."
2645 if not os.path.exists(ctx.cvsroot):
2646 sys.stderr.write(error_prefix + ': \'%s\' does not exist.\n' % ctx.cvsroot)
2647 sys.exit(1)
2649 times = [ None ] * len(_passes)
2650 for i in range(start_pass - 1, len(_passes)):
2651 times[i] = time.time()
2652 print '----- pass %d -----' % (i + 1)
2653 _passes[i](ctx)
2654 times.append(time.time())
2656 for i in range(start_pass, len(_passes)+1):
2657 print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
2658 print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
2661 def usage(ctx):
2662 print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
2663 % os.path.basename(sys.argv[0])
2664 print ' --help, -h print this usage message and exit with success'
2665 print ' -n dry run; parse CVS repos, but do not construct SVN repos'
2666 print ' -v verbose'
2667 print ' -s PATH path for SVN repos'
2668 print ' -p NUM start at pass NUM of %d' % len(_passes)
2669 print ' --existing-svnrepos load into existing SVN repository'
2670 print ' --dumpfile=PATH name of intermediate svn dumpfile'
2671 print ' --svnadmin=PATH path to the svnadmin program'
2672 print ' --trunk-only convert only trunk commits, not tags nor branches'
2673 print ' --trunk=PATH path for trunk (default: %s)' \
2674 % ctx.trunk_base
2675 print ' --branches=PATH path for branches (default: %s)' \
2676 % ctx.branches_base
2677 print ' --tags=PATH path for tags (default: %s)' \
2678 % ctx.tags_base
2679 print ' --no-prune don\'t prune empty directories'
2680 print ' --dump-only just produce a dumpfile, don\'t commit to a repos'
2681 print ' --encoding=ENC encoding of log messages in CVS repos (default: %s)' \
2682 % ctx.encoding
2683 print ' --username=NAME username for cvs2svn-synthesized commits'
2684 print ' (default: %s)' \
2685 % ctx.username
2686 print ' --skip-cleanup prevent the deletion of intermediate files'
2687 print ' --bdb-txn-nosync pass --bdb-txn-nosync to "svnadmin create"'
2688 print ' --cvs-revnums record CVS revision numbers as file properties'
2692 def main():
2693 # prepare the operation context
2694 ctx = _ctx()
2695 ctx.cvsroot = None
2696 ctx.target = None
2697 ctx.log_fname_base = DATAFILE
2698 ctx.dumpfile = DUMPFILE
2699 ctx.verbose = 0
2700 ctx.dry_run = 0
2701 ctx.prune = 1
2702 ctx.existing_svnrepos = 0
2703 ctx.dump_only = 0
2704 ctx.trunk_only = 0
2705 ctx.trunk_base = "trunk"
2706 ctx.tags_base = "tags"
2707 ctx.branches_base = "branches"
2708 ctx.encoding = "ascii"
2709 ctx.svnadmin = "svnadmin"
2710 ctx.username = "unknown"
2711 ctx.print_help = 0
2712 ctx.skip_cleanup = 0
2713 ctx.cvs_revnums = 0
2714 ctx.bdb_txn_nosync = 0
2716 start_pass = 1
2718 try:
2719 opts, args = getopt.getopt(sys.argv[1:], 'p:s:vnh',
2720 [ "help", "create", "trunk=",
2721 "username=", "existing-svnrepos",
2722 "branches=", "tags=", "encoding=",
2723 "trunk-only", "no-prune",
2724 "dump-only", "dumpfile=", "svnadmin=",
2725 "skip-cleanup", "cvs-revnums",
2726 "bdb-txn-nosync"])
2727 except getopt.GetoptError, e:
2728 sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
2729 usage(ctx)
2730 sys.exit(1)
2732 for opt, value in opts:
2733 if opt == '-p':
2734 start_pass = int(value)
2735 if start_pass < 1 or start_pass > len(_passes):
2736 print '%s: illegal value (%d) for starting pass. ' \
2737 'must be 1 through %d.' % (error_prefix, start_pass,
2738 len(_passes))
2739 sys.exit(1)
2740 elif (opt == '--help') or (opt == '-h'):
2741 ctx.print_help = 1
2742 elif opt == '-v':
2743 ctx.verbose = 1
2744 elif opt == '-n':
2745 ctx.dry_run = 1
2746 elif opt == '-s':
2747 ctx.target = value
2748 elif opt == '--existing-svnrepos':
2749 ctx.existing_svnrepos = 1
2750 elif opt == '--dumpfile':
2751 ctx.dumpfile = value
2752 elif opt == '--svnadmin':
2753 ctx.svnadmin = value
2754 elif opt == '--trunk-only':
2755 ctx.trunk_only = 1
2756 elif opt == '--trunk':
2757 ctx.trunk_base = value
2758 elif opt == '--branches':
2759 ctx.branches_base = value
2760 elif opt == '--tags':
2761 ctx.tags_base = value
2762 elif opt == '--no-prune':
2763 ctx.prune = None
2764 elif opt == '--dump-only':
2765 ctx.dump_only = 1
2766 elif opt == '--encoding':
2767 ctx.encoding = value
2768 elif opt == '--username':
2769 ctx.username = value
2770 elif opt == '--skip-cleanup':
2771 ctx.skip_cleanup = 1
2772 elif opt == '--cvs-revnums':
2773 ctx.cvs_revnums = 1
2774 elif opt == '--bdb-txn-nosync':
2775 ctx.bdb_txn_nosync = 1
2776 elif opt == '--create':
2777 sys.stderr.write(warning_prefix +
2778 ': The behaviour produced by the --create option is now the '
2779 'default,\nand passing the option is deprecated.\n')
2781 if ctx.print_help:
2782 usage(ctx)
2783 sys.exit(0)
2785 # Consistency check for options and arguments.
2786 if len(args) == 0:
2787 usage(ctx)
2788 sys.exit(1)
2790 if len(args) > 1:
2791 sys.stderr.write(error_prefix +
2792 ": must pass only one CVS repository.\n")
2793 usage(ctx)
2794 sys.exit(1)
2796 ctx.cvsroot = args[0]
2798 if not os.path.isdir(ctx.cvsroot):
2799 sys.stderr.write(error_prefix +
2800 ": the cvs-repos-path '%s' is not an "
2801 "existing directory.\n" % ctx.cvsroot)
2802 sys.exit(1)
2804 if (not ctx.target) and (not ctx.dump_only):
2805 sys.stderr.write(error_prefix +
2806 ": must pass one of '-s' or '--dump-only'.\n")
2807 sys.exit(1)
2809 def not_both(opt1val, opt1name, opt2val, opt2name):
2810 if opt1val and opt2val:
2811 sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
2812 % (opt1name, opt2name))
2814 not_both(ctx.target, '-s', ctx.dump_only, '--dump-only')
2816 not_both(ctx.dump_only, '--dump-only',
2817 ctx.existing_svnrepos, '--existing-svnrepos')
2819 not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
2820 ctx.existing_svnrepos, '--existing-svnrepos')
2822 not_both(ctx.dump_only, '--dump-only',
2823 ctx.bdb_txn_nosync, '--bdb-txn-nosync')
2825 if ((string.find(ctx.trunk_base, '/') > -1)
2826 or (string.find(ctx.tags_base, '/') > -1)
2827 or (string.find(ctx.branches_base, '/') > -1)):
2828 sys.stderr.write("%s: cannot pass multicomponent path to "
2829 "--trunk, --tags, or --branches yet.\n"
2830 " See http://subversion.tigris.org/issues/show_bug.cgi?"
2831 "id=1409 "
2832 "for details.\n" % error_prefix)
2833 sys.exit(1)
2835 if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
2836 sys.stderr.write(error_prefix +
2837 ": the svn-repos-path '%s' is not an "
2838 "existing directory.\n" % ctx.target)
2839 sys.exit(1)
2841 if not ctx.dump_only and not ctx.existing_svnrepos \
2842 and os.path.exists(ctx.target):
2843 sys.stderr.write(error_prefix +
2844 ": the svn-repos-path '%s' exists.\nRemove it, or pass "
2845 "'--existing-svnrepos'.\n" % ctx.target)
2846 sys.exit(1)
2848 ctx.default_branches_db = Database(DEFAULT_BRANCHES_DB, 'n')
2850 convert(ctx, start_pass=start_pass)
2853 if __name__ == '__main__':
2854 main()