# Resolve issue 27 ("two instances of cvs2svn invoked from the same dir thrash")
# [cvs2svn.git] / cvs2svn.py
# blob ddcd39e9c835b178ed17bfe77e5679cd5133b1ea
1 #!/usr/bin/env python
3 # cvs2svn: ...
5 # $LastChangedRevision$
7 # ====================================================================
8 # Copyright (c) 2000-2004 CollabNet. All rights reserved.
10 # This software is licensed as described in the file COPYING, which
11 # you should have received as part of this distribution. The terms
12 # are also available at http://subversion.tigris.org/license-1.html.
13 # If newer versions of this license are posted there, you may use a
14 # newer version instead, at your option.
16 # This software consists of voluntary contributions made by many
17 # individuals. For exact contribution history, see the revision
18 # history and logs, available at http://cvs2svn.tigris.org/.
19 # ====================================================================
21 import rcsparse
22 import os
23 import sys
24 import sha
25 import re
26 import time
27 import fileinput
28 import string
29 import getopt
30 import stat
31 import string
32 import md5
33 import anydbm
34 import marshal
# Warnings and errors start with these strings.  They are typically
# followed by a colon and a space, as in "%s: " ==> "Warning: ".
warning_prefix = "Warning"
error_prefix = "Error"

# Make sure this Python is recent enough.
if sys.hexversion < 0x2000000:
  # (Fixed: the message used to begin with a stray apostrophe,
  # printing as "'Error: Python 2.0 or higher required...".)
  sys.stderr.write("%s: Python 2.0 or higher required, "
                   "see www.python.org.\n" % error_prefix)
  sys.exit(1)
# Don't settle for less.
#
# anydbm picks the "best" available dbm implementation at import time;
# dumbdbm and plain dbm are too limited for the amount of metadata we
# store, so refuse to run with them.
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  print 'ERROR: your installation of Python does not contain a suitable'
  print ' DBM module. This script cannot continue.'
  print ' to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  print ' for details.'
  sys.exit(1)

# A bsddb module without a __version__ attribute is an old build that
# has been reported to raise spurious KeyErrors on some datasets.
# Prefer gdbm in that case; if gdbm isn't available, just warn and
# continue with the suspect bsddb.
if hasattr(anydbm._defaultmod, 'bsddb') \
   and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
# Regexps classifying RCS revision numbers.
trunk_rev = re.compile(r'^[0-9]+\.[0-9]+$')            # e.g. '1.7'
branch_tag = re.compile(r'^[0-9.]+\.0\.[0-9]+$')       # e.g. '1.7.0.2'
vendor_tag = re.compile(r'^[0-9]+\.[0-9]+\.[0-9]+$')   # e.g. '1.1.1'
# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# Group 1 is the vendor branch number ('1.1.1'); group 2 is the final
# component.  (Fixed: the quantifier belongs inside the group --
# '([0-9]+)' -- the old '([0-9])+' matched the same strings but
# captured only the last digit.)
vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9]+)$')
# Base name for the intermediate data files this pass produces.
DATAFILE = 'cvs2svn-data'
DUMPFILE = 'cvs2svn-dump' # The "dumpfile" we create to load into the repos

# Skeleton version of an svn filesystem.
# See class RepositoryMirror for how these work.
SVN_REVISIONS_DB = 'cvs2svn-revisions.db'
NODES_DB = 'cvs2svn-nodes.db'

# os.popen() on Windows seems to require an access-mode string of 'rb'
# in cases where the process will output binary information to stdout.
# Without the 'b' we get IOErrors upon closing the pipe.  Unfortunately
# 'rb' isn't accepted in the Linux version of os.popen().  As a purely
# practical matter, we compensate by switching on os.name.
if os.name == 'nt':
  PIPE_READ_MODE = 'rb'
  PIPE_WRITE_MODE = 'wb'
else:
  PIPE_READ_MODE = 'r'
  PIPE_WRITE_MODE = 'w'

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the origin ranges for branches and tags.
# See class RepositoryMirror for how this works.
SYMBOLIC_NAME_ROOTS_DB = 'cvs2svn-symroots.db'

# See class SymbolicNameTracker for details.
SYMBOLIC_NAMES_DB = "cvs2svn-sym-names.db"

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"

# Suffixes for the per-pass revision log files.
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# Path component marking CVS's graveyard for deleted files.
ATTIC = os.sep + 'Attic'

SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60  # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP = '-'
OP_ADD = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY = 'E'

# End index of the digest field in a .revs line: presumably an 8-char
# hex timestamp plus a space precede the hex SHA digest -- TODO
# confirm against write_revs_line (not visible in this chunk).
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Officially, CVS symbolic names must use a fairly restricted set of
# characters.  Unofficially, CVS 1.10 allows any character but [$,.:;@]
# We don't care if some repositories out there use characters outside the
# official set, as long as their tags start with a letter.
# Since the unofficial set also includes [/\] we need to translate those
# into ones that don't conflict with Subversion limitations.
# (string.maketrans is the Python 2 string-module form of this API.)
symbolic_name_re = re.compile('^[a-zA-Z].*$')
symbolic_name_transtbl = string.maketrans('/\\',',;')
# A thin dictionary-like shim over anydbm: values are serialized with
# the marshal module on the way in and deserialized on the way out, so
# arbitrary (marshalable) Python objects can be stored as strings.
class Database:
  """anydbm database wrapper that marshals values to/from strings."""
  def __init__(self, filename, mode):
    self.db = anydbm.open(filename, mode)

  def has_key(self, key):
    """Return true iff KEY is present in the database."""
    return self.db.has_key(key)

  def __getitem__(self, key):
    """Return the deserialized value stored under KEY."""
    return marshal.loads(self.db[key])

  def __setitem__(self, key, value):
    """Serialize VALUE and store it under KEY."""
    self.db[key] = marshal.dumps(value)

  def __delitem__(self, key):
    """Remove KEY and its value from the database."""
    del self.db[key]
class CollectData(rcsparse.Sink):
  """rcsparse Sink that mines revision data from CVS ,v files.

  The parser drives this class via callbacks; for each file it writes
  one line per revision to a .revs log, records timestamp corrections
  in a .resync log, stores author/log metadata in METADATA_DB, and
  tracks default-branch (vendor branch) information in
  DEFAULT_BRANCHES_DB for the later conversion passes."""

  def __init__(self, cvsroot, log_fname_base, default_branches_db):
    self.cvsroot = cvsroot
    self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
    self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
    self.default_branches_db = default_branches_db
    self.metadata_db = Database(METADATA_DB, 'n')
    # Accumulates error strings; a non-empty list aborts conversion.
    self.fatal_errors = []

    # Branch and tag label types.
    self.BRANCH_LABEL = 0
    self.VENDOR_BRANCH_LABEL = 1
    self.TAG_LABEL = 2
    # A label type to string conversion list
    self.LABEL_TYPES = [ 'branch', 'vendor branch', 'tag' ]
    # A dict mapping label names to types
    self.label_type = { }

    # See set_fname() for initializations of other variables.

  def set_fname(self, fname):
    "Prepare to receive data for a new file."
    self.fname = fname

    # revision -> [timestamp, author, operation, old-timestamp]
    self.rev_data = { }
    # revision -> predecessor revision, for the monotonicity check
    # performed in tree_completed().
    self.prev = { }

    # Hash mapping branch numbers, like '1.7.2', to branch names,
    # like 'Release_1_0_dev'.
    self.branch_names = { }

    # Hash mapping revision numbers, like '1.7', to lists of names
    # indicating which branches sprout from that revision, like
    # ['Release_1_0_dev', 'experimental_driver', ...].
    self.branchlist = { }

    # Like self.branchlist, but the values are lists of tag names that
    # apply to the key revision.
    self.taglist = { }

    # This is always a number -- rcsparse calls this the "principal
    # branch", but CVS and RCS refer to it as the "default branch",
    # so that's what we call it, even though the rcsparse API setter
    # method is still 'set_principal_branch'.
    self.default_branch = None

    # If the RCS file doesn't have a default branch anymore, but does
    # have vendor revisions, then we make an educated guess that those
    # revisions *were* the head of the default branch up until the
    # commit of 1.2, at which point the file's default branch became
    # trunk.  This records the date at which 1.2 was committed.
    self.first_non_vendor_revision_date = None

  def set_principal_branch(self, branch):
    # rcsparse callback: record the file's RCS default branch, if any.
    self.default_branch = branch

  def set_branch_name(self, branch_number, name):
    """Record that BRANCH_NUMBER is the branch number for branch NAME,
    and that NAME sprouts from BRANCH_NUMBER.
    BRANCH_NUMBER is an RCS branch number with an odd number of components,
    for example '1.7.2' (never '1.7.0.2')."""
    if not self.branch_names.has_key(branch_number):
      self.branch_names[branch_number] = name
      # The branchlist is keyed on the revision number from which the
      # branch sprouts, so strip off the odd final component.
      sprout_rev = branch_number[:branch_number.rfind(".")]
      if not self.branchlist.has_key(sprout_rev):
        self.branchlist[sprout_rev] = []
      self.branchlist[sprout_rev].append(name)
    else:
      # First name wins; warn about (and drop) any later aliases.
      sys.stderr.write("%s: in '%s':\n"
                       " branch '%s' already has name '%s',\n"
                       " cannot also have name '%s', ignoring the latter\n"
                       % (warning_prefix, self.fname, branch_number,
                          self.branch_names[branch_number], name))

  def rev_to_branch_name(self, revision):
    """Return the name of the branch on which REVISION lies.
    REVISION is a non-branch revision number with an even number of
    components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    For the convenience of callers, REVISION can also be a trunk
    revision such as '1.2', in which case just return None."""
    if trunk_rev.match(revision):
      return None
    return self.branch_names.get(revision[:revision.rindex(".")])

  def add_cvs_branch(self, revision, branch_name):
    """Record the root revision and branch revision for BRANCH_NAME,
    based on REVISION.  REVISION is a CVS branch number having an even
    number of components where the second-to-last is '0'.  For
    example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
    from 1.7 and has branch number 1.7.2."""
    last_dot = revision.rfind(".")
    branch_rev = revision[:last_dot]
    last2_dot = branch_rev.rfind(".")
    # Drop the magic '0' component: '1.7.0.2' -> '1.7' + '.2' == '1.7.2'.
    branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
    self.set_branch_name(branch_rev, branch_name)

  def get_tags(self, revision):
    """Return a list of all tag names attached to REVISION.
    REVISION is a regular revision number like '1.7', and the result
    never includes branch names, only plain tags."""
    return self.taglist.get(revision, [])

  def get_branches(self, revision):
    """Return a list of all branch names that sprout from REVISION.
    REVISION is a regular revision number like '1.7'."""
    return self.branchlist.get(revision, [])

  def define_tag(self, name, revision):
    """Record a bidirectional mapping between symbolic NAME and REVISION.
    REVISION is an unprocessed revision number from the RCS file's
    header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
    This function will determine what kind of symbolic name it is by
    inspection, and record it in the right places."""
    if not symbolic_name_re.match(name):
      sys.stderr.write("%s: in '%s':\n"
                       " '%s' is not a valid tag or branch name, ignoring\n"
                       % (warning_prefix, self.fname, name))
    elif branch_tag.match(revision):
      label_type = self.BRANCH_LABEL
      self.add_cvs_branch(revision, name)
    elif vendor_tag.match(revision):
      label_type = self.VENDOR_BRANCH_LABEL
      self.set_branch_name(revision, name)
    else:
      label_type = self.TAG_LABEL
      if not self.taglist.has_key(revision):
        self.taglist[revision] = []
      self.taglist[revision].append(name)

    # NOTE(review): if NAME failed the validity check above,
    # label_type is unbound here; a first-time invalid NAME raises
    # KeyError below and then NameError in the handler -- TODO confirm
    # and guard.
    try:
      # if label_types are different and at least one is a tag (We
      # don't want to error on branch/vendor branch mismatches)
      if (self.label_type[name] != label_type
          and(self.label_type[name] == self.TAG_LABEL
              or label_type == self.TAG_LABEL)):
        err = ("%s: in '%s' (BRANCH/TAG MISMATCH):\n '%s' "
               " is defined as %s here, but as a %s elsewhere"
               % (error_prefix, self.fname, name,
                  self.LABEL_TYPES[label_type],
                  self.LABEL_TYPES[self.label_type[name]]))
        sys.stderr.write(err)
        self.fatal_errors.append(err)
    except KeyError:
      # First sighting of this label; remember its type.
      self.label_type[name] = label_type

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    ### what else?
    # An RCS 'dead' state means the revision deletes the file.
    if state == 'dead':
      op = OP_DELETE
    else:
      op = OP_CHANGE

    # store the rev_data as a list in case we have to jigger the timestamp
    self.rev_data[revision] = [int(timestamp), author, op, None]

    # record the previous revision for sanity checking later
    # (on trunk, RCS 'next' pointers run backwards in time, hence the
    # asymmetry between the trunk and branch cases -- see rcsfile(5)).
    if trunk_rev.match(revision):
      self.prev[revision] = next
    elif next:
      self.prev[next] = revision
    for b in branches:
      self.prev[b] = revision

    # Ratchet up the highest vendor head revision, if necessary.
    if self.default_branch:
      if revision.find(self.default_branch) == 0:
        # This revision is on the default branch, so record that it is
        # the new highest vendor head revision.
        # ([:-2] strips the ',v' suffix from the relative path.)
        rel_name = relative_name(self.cvsroot, self.fname)[:-2]
        self.default_branches_db[rel_name] = revision
    else:
      # No default branch, so make an educated guess.
      if revision == '1.2':
        # This is probably the time when the file stopped having a
        # default branch, so make a note of it.
        self.first_non_vendor_revision_date = timestamp
      else:
        m = vendor_revision.match(revision)
        if m and ((not self.first_non_vendor_revision_date)
                  or (timestamp < self.first_non_vendor_revision_date)):
          # We're looking at a vendor revision, and it wasn't
          # committed after this file lost its default branch, so bump
          # the maximum trunk vendor revision in the permanent record.
          rel_name = relative_name(self.cvsroot, self.fname)[:-2]
          self.default_branches_db[rel_name] = revision

    # Check for unlabeled branches, record them.  We tried to collect
    # all branch names when we parsed the symbolic name header
    # earlier, of course, but that didn't catch unlabeled branches.
    # If a branch is unlabeled, this is our first encounter with it,
    # so we have to record its data now.
    if not trunk_rev.match(revision):
      branch_number = revision[:revision.rindex(".")]
      branch_name = "unlabeled-" + branch_number
      if not self.branch_names.has_key(branch_number):
        self.set_branch_name(branch_number, branch_name)

  def tree_completed(self):
    "The revision tree has been parsed.  Analyze it for consistency."

    # Our algorithm depends upon the timestamps on the revisions occuring
    # monotonically over time.  That is, we want to see rev 1.34 occur in
    # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
    # sorting), and then tried to insert 1.34, we'd be screwed.

    # to perform the analysis, we'll simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on the
    # previous revision is before the specified revision

    # if we have to resync some nodes, then we restart the scan.  just keep
    # looping as long as we need to restart.
    while 1:
      for current, prev in self.prev.items():
        if not prev:
          # no previous revision exists (i.e. the initial revision)
          continue
        t_c = self.rev_data[current][0]
        t_p = self.rev_data[prev][0]
        if t_p >= t_c:
          # the previous revision occurred later than the current revision.
          # shove the previous revision back in time (and any before it that
          # may need to shift).
          while t_p >= t_c:
            self.rev_data[prev][0] = t_c - 1  # new timestamp
            self.rev_data[prev][3] = t_p      # old timestamp

            print "RESYNC: '%s' (%s) : old time='%s' new time='%s'" \
                  % (relative_name(self.cvsroot, self.fname),
                     prev, time.ctime(t_p), time.ctime(t_c - 1))

            current = prev
            prev = self.prev[current]
            if not prev:
              break
            t_c = t_c - 1  # self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

          # break from the for-loop
          break
      else:
        # finished the for-loop (no resyncing was performed)
        return

  def set_revision_info(self, revision, log, text):
    """rcsparse callback: record the log/author metadata for REVISION
    and write its line to the .revs file."""
    timestamp, author, op, old_ts = self.rev_data[revision]
    digest = sha.new(log + '\0' + author).hexdigest()
    if old_ts:
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files's revisions that occurred
      # for this time and log message.
      self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))

    # "...Give back one kadam to honor the Hebrew God whose Ark this is."
    #    -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
    #
    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
      rel_name = relative_name(self.cvsroot, self.fname)[:-2]
      if self.default_branches_db.has_key(rel_name):
        del self.default_branches_db[rel_name]

    if text:
      deltatext_code = DELTATEXT_NONEMPTY
    else:
      deltatext_code = DELTATEXT_EMPTY

    write_revs_line(self.revs, timestamp, digest, op, revision,
                    deltatext_code, self.fname,
                    self.rev_to_branch_name(revision),
                    self.get_tags(revision),
                    self.get_branches(revision))

    # Author and log message are stored once per unique digest.
    if not self.metadata_db.has_key(digest):
      self.metadata_db[digest] = (author, log)
def run_command(command):
  """Execute COMMAND via the shell; terminate the whole program with
  an error message if it exits unsuccessfully."""
  status = os.system(command)
  if status:
    sys.exit('Command failed: "%s"' % command)
def make_path(ctx, path, branch_name = None, tag_name = None):
  """Return the trunk path, branch path, or tag path for PATH.

  CTX supplies the trunk/branches/tags base directories.  The
  appropriate base -- plus the translated BRANCH_NAME or TAG_NAME,
  when one is given -- is prepended to PATH.  If PATH is empty or
  None, return just the root trunk|branch|tag path.

  The genealogy is prepended to the whole path rather than
  interpolated after each top-level project subdir: prepending keeps
  "anonymously rooted subtrees" usable and lets cvs2svn be pointed at
  any subdir of a CVS repository without treating every child as an
  independent project root.  (See Blair Zajac's post
  http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
  and the surrounding thread: what people really want is a way of
  specifying an in-repository prefix path, not interpolation.)

  It is an error to pass both a BRANCH_NAME and a TAG_NAME."""

  # Check caller sanity.
  if branch_name and tag_name:
    sys.stderr.write("%s: make_path() miscalled: both branch and tag given.\n"
                     % error_prefix)
    sys.exit(1)

  if branch_name:
    # Symbolic names may contain '/' or '\'; translate them to
    # characters Subversion accepts.
    safe_name = branch_name.translate(symbolic_name_transtbl)
    base = ctx.branches_base + '/' + safe_name
  elif tag_name:
    safe_name = tag_name.translate(symbolic_name_transtbl)
    base = ctx.tags_base + '/' + safe_name
  else:
    base = ctx.trunk_base

  if path:
    return base + '/' + path
  return base
def relative_name(cvsroot, fname):
  """Return FNAME relative to CVSROOT, with '/' as path separator.

  FNAME must be a proper sub-path of CVSROOT; otherwise print an
  error and exit the program.  A separator immediately following the
  CVSROOT prefix is stripped as well."""
  l = len(cvsroot)
  # The length check guards against FNAME == CVSROOT exactly, which
  # previously raised IndexError on fname[l]; it now falls through to
  # the not-a-sub-path error below.
  if fname[:l] == cvsroot and len(fname) > l:
    if fname[l] == os.sep:
      return fname[l+1:].replace(os.sep, '/')
    return fname[l:].replace(os.sep, '/')
  sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
                   " cvsroot\n" % (error_prefix, cvsroot, fname))
  sys.exit(1)
def visit_file(arg, dirname, files):
  """Directory-walk callback: parse every RCS ',v' file in FILES.

  ARG is a tuple (collector, parser, stats): collector is a
  CollectData sink, parser an rcsparse parser, and stats a one-element
  list counting successfully parsed files.  Unparseable files are
  recorded in collector.fatal_errors."""
  cd, p, stats = arg
  for fname in files:
    # Only RCS ',v' files are of interest.
    if not fname.endswith(',v'):
      continue
    pathname = os.path.join(dirname, fname)
    if dirname.endswith(ATTIC):
      # drop the 'Attic' portion from the pathname
      ### we should record this so we can easily insert it back in
      cd.set_fname(os.path.join(dirname[:-6], fname))
    else:
      cd.set_fname(pathname)
    print(pathname)
    try:
      p.parse(open(pathname, 'rb'), cd)
      stats[0] = stats[0] + 1
    except (rcsparse.common.RCSParseError, ValueError, RuntimeError):
      err = "%s: '%s' is not a valid ,v file" \
            % (error_prefix, pathname)
      print(err)
      cd.fatal_errors.append(err)
    except:
      print("Exception occurred while parsing %s" % pathname)
      raise
# Counter backing gen_key(); see below.
# (Fixed: was the obsolete long literal '0L' -- a plain int behaves
# identically under Python 2's automatic int/long promotion and keeps
# the file parseable by modern tooling.)
gen_key_base = 0

def gen_key():
  """Return a short hex string that has not been returned by gen_key()
  before (within this process).  Used to mint unique node keys."""
  global gen_key_base
  key = '%x' % gen_key_base
  gen_key_base = gen_key_base + 1
  return key
class Change:
  """Record of what actually happened when a change was made, since
  not all of the result is guessable by the caller.  See
  RepositoryMirror.change_path() for more.

  Attributes:
    op:              OP_ADD if the path was added, OP_CHANGE if it was
                     changed, or OP_NOOP if no action was taken.
    closed_tags:     tags this path can no longer be the source of --
                     i.e. tags which could be rooted in the path
                     before the change, but not after.
    closed_branches: like closed_tags, but for branches.
    deleted_entries: entries deleted from the destination after
                     copying a directory, or None.
    copyfrom_rev:    the actual revision the path was copied from,
                     which may be one less than the requested revision
                     when the path was deleted in the requested
                     revision, or None."""
  def __init__(self, op, closed_tags, closed_branches,
               deleted_entries=None, copyfrom_rev=None):
    self.op = op
    self.closed_tags = closed_tags
    self.closed_branches = closed_branches
    self.deleted_entries = deleted_entries
    self.copyfrom_rev = copyfrom_rev
629 class RepositoryMirror:
  def __init__(self):
    """Create fresh ('n' mode) revisions/nodes/symroots databases and
    initialize revision 0 with an empty root directory."""
    # This corresponds to the 'revisions' table in a Subversion fs.
    self.revs_db_file = SVN_REVISIONS_DB
    self.revs_db = Database(self.revs_db_file, 'n')

    # This corresponds to the 'nodes' table in a Subversion fs.  (We
    # don't need a 'representations' or 'strings' table because we
    # only track metadata, not file contents.)
    self.nodes_db_file = NODES_DB
    self.nodes_db = Database(self.nodes_db_file, 'n')

    # This tracks which symbolic names the current "head" of a given
    # filepath could be the origin node for.  When the next commit on
    # that path comes along, we can tell which symbolic names
    # originated in the previous version, and signal back to the
    # caller that the file can no longer be the origin for those names.
    #
    # The values are tuples, (tags, branches), where each value is a
    # list.
    self.symroots_db_file = SYMBOLIC_NAME_ROOTS_DB
    self.symroots_db = Database(self.symroots_db_file, 'n')

    # When copying a directory (say, to create part of a branch), we
    # pass change_path() a list of expected entries, so it can remove
    # any that are in the source but don't belong on the branch.
    # However, because creating a given region of a branch can involve
    # copying from several sources, we don't want later copy
    # operations to delete entries that were legitimately created by
    # earlier copy ops.  So after a copy, the directory records
    # legitimate entries under this key, in a dictionary (the keys are
    # entry names, the values can be ignored).
    self.approved_entries = "/approved-entries"

    # Set to a true value on a directory that's mutable in the
    # revision currently being constructed.  (Yes, this is exactly
    # analogous to the Subversion filesystem code's concept of
    # mutability.)
    #
    # Is also overloaded with a second piece of information:
    # If the value of the flag is 2, then in addition to the node
    # being mutable, the node and all subnodes were created by a copy
    # operation in the current revision.  In this and only this
    # circumstance, it is valid for pruning to occur.
    self.mutable_flag = "/mutable"
    # This could represent a new mutable directory or file.
    self.empty_mutable_thang = { self.mutable_flag : 1 }

    # Init a root directory with no entries at revision 0.
    self.youngest = 0
    youngest_key = gen_key()
    self.revs_db[str(self.youngest)] = youngest_key
    self.nodes_db[youngest_key] = {}
682 def new_revision(self):
683 """Stabilize the current revision, then start the next one.
684 (Increments youngest.)"""
685 self.stabilize_youngest()
686 self.revs_db[str(self.youngest + 1)] \
687 = self.revs_db[str(self.youngest)]
688 self.youngest = self.youngest + 1
690 def _stabilize_directory(self, key):
691 """Close the directory whose node key is KEY."""
692 dir = self.nodes_db[key]
693 if dir.has_key(self.mutable_flag):
694 del dir[self.mutable_flag]
695 if dir.has_key(self.approved_entries):
696 del dir[self.approved_entries]
697 for entry_key in dir.keys():
698 if not entry_key[0] == '/':
699 self._stabilize_directory(dir[entry_key])
700 self.nodes_db[key] = dir
702 def stabilize_youngest(self):
703 """Stabilize the current revision by removing mutable flags."""
704 root_key = self.revs_db[str(self.youngest)]
705 self._stabilize_directory(root_key)
707 def probe_path(self, path, revision=-1, debugging=None):
708 """If PATH exists in REVISION of the svn repository mirror,
709 return its leaf value, else return None.
710 If DEBUGGING is true, then print trace output to stdout.
711 REVISION defaults to youngest, and PATH must not start with '/'."""
712 components = string.split(path, '/')
713 if revision == -1:
714 revision = self.youngest
716 if debugging:
717 print "PROBING path: '%s' in %d" % (path, revision)
719 parent_key = self.revs_db[str(revision)]
720 parent = self.nodes_db[parent_key]
721 previous_component = "/"
723 i = 1
724 for component in components:
726 if debugging:
727 print " " * i,
728 print "'%s' key: %s, val:" % (previous_component, parent_key), parent
730 if not parent.has_key(component):
731 if debugging:
732 print " PROBE ABANDONED: '%s' does not contain '%s'" \
733 % (previous_component, component)
734 return None
736 this_entry_key = parent[component]
737 this_entry_val = self.nodes_db[this_entry_key]
738 parent_key = this_entry_key
739 parent = this_entry_val
740 previous_component = component
741 i = i + 1
743 if debugging:
744 print " " * i,
745 print "parent_key: %s, val:" % parent_key, parent
747 # It's not actually a parent at this point, it's the leaf node.
748 return parent
750 def change_path(self, path, tags, branches,
751 intermediate_dir_func=None,
752 copyfrom_path=None, copyfrom_rev=None,
753 expected_entries=None, only_if_already_exists=None):
754 """Record a change to PATH. PATH may not have a leading slash.
755 Return a Change instance representing the result of the
756 change.
758 TAGS are any tags that sprout from this revision of PATH, BRANCHES
759 are any branches that sprout from this revision of PATH.
761 If INTERMEDIATE_DIR_FUNC is not None, then invoke it once on
762 each full path to each missing intermediate directory in PATH, in
763 order from shortest to longest.
765 If COPYFROM_REV and COPYFROM_PATH are not None, then they are a
766 revision and path to record as the copyfrom sources of this node.
767 Since this implies an add (OP_ADD), it would be reasonable to
768 error and exit if the copyfrom args are present but the node also
769 already exists. Reasonable -- but not what we do :-). The most
770 useful behavior for callers is instead to report that nothing was
771 done, by returning OP_NOOP for Change.op, so that's what we do.
773 It is an error for only one copyfrom argument to be present.
775 If EXPECTED_ENTRIES is not None, then it holds entries expected
776 to be in the dst after the copy. Any entries in the new dst but
777 not in EXPECTED_ENTRIES are removed (ignoring keys beginning with
778 '/'), and the removed entries returned in Change.deleted_entries,
779 which are otherwise None.
781 No action is taken for keys in EXPECTED_ENTRIES but not in the
782 dst; it is assumed that the caller will compensate for these by
783 calling change_path again with other arguments.
785 If ONLY_IF_ALREADY_EXISTS is set, then do a no-op, rather than an add,
786 if the path does not exist. This is to allow pruning using EXPECTED_ENTRIES
787 without risking erroneously adding a path."""
789 # Check caller sanity.
790 if ((copyfrom_rev and not copyfrom_path) or
791 (copyfrom_path and not copyfrom_rev)):
792 sys.stderr.write("%s: change_path() called with one copyfrom "
793 "argument but not the other.\n" % error_prefix)
794 sys.exit(1)
796 components = string.split(path, '/')
797 path_so_far = None
799 deletions = []
800 in_pruneable_subtree = None
802 parent_key = self.revs_db[str(self.youngest)]
803 parent = self.nodes_db[parent_key]
804 if not parent.has_key(self.mutable_flag):
805 parent_key = gen_key()
806 parent[self.mutable_flag] = 1
807 self.nodes_db[parent_key] = parent
808 self.revs_db[str(self.youngest)] = parent_key
810 for component in components[:-1]:
811 # parent is always mutable at the top of the loop
813 if path_so_far:
814 path_so_far = path_so_far + '/' + component
815 else:
816 path_so_far = component
818 # Ensure that the parent has an entry for this component.
819 if not parent.has_key(component):
820 if only_if_already_exists:
821 return Change(OP_NOOP, [], [], deletions)
822 # else
823 new_child_key = gen_key()
824 parent[component] = new_child_key
825 self.nodes_db[new_child_key] = self.empty_mutable_thang
826 self.nodes_db[parent_key] = parent
827 if intermediate_dir_func:
828 intermediate_dir_func(path_so_far)
830 # One way or another, parent dir now has an entry for component,
831 # so grab it, see if it's mutable, and DTRT if it's not. (Note
832 # it's important to reread the entry value from the db, even
833 # though we might have just written it -- if we tweak existing
834 # data structures, we could modify self.empty_mutable_thang,
835 # which must not happen.)
836 this_entry_key = parent[component]
837 this_entry_val = self.nodes_db[this_entry_key]
838 mutable = this_entry_val.get(self.mutable_flag)
839 if not mutable:
840 this_entry_val[self.mutable_flag] = 1
841 this_entry_key = gen_key()
842 parent[component] = this_entry_key
843 self.nodes_db[this_entry_key] = this_entry_val
844 self.nodes_db[parent_key] = parent
845 elif mutable == 2:
846 in_pruneable_subtree = 1
848 parent_key = this_entry_key
849 parent = this_entry_val
851 # Now change the last node, the versioned file. Just like at the
852 # top of the above loop, parent is already mutable.
853 op = OP_ADD
854 if self.symroots_db.has_key(path):
855 old_names = self.symroots_db[path]
856 else:
857 old_names = [], []
858 last_component = components[-1]
859 new_val = { }
860 if parent.has_key(last_component):
861 # The contract for copying over existing nodes is to do nothing
862 # and return:
863 if copyfrom_path:
864 return Change(OP_NOOP, old_names[0], old_names[1], deletions)
865 # else
866 op = OP_CHANGE
867 new_val = self.nodes_db[parent[last_component]]
868 elif only_if_already_exists:
869 return Change(OP_NOOP, [], [], deletions)
871 leaf_key = gen_key()
872 if copyfrom_path:
873 new_val = self.probe_path(copyfrom_path, copyfrom_rev)
874 if new_val is None:
875 # Sometimes a branch is rooted in a revision that RCS has
876 # marked as 'dead'. There is no reason to assume that the
877 # current path shares any history with any older live parent
878 # of the dead revision, so we do nothing and return.
879 return Change(OP_NOOP, [], [], deletions)
880 # Special value of mutable flag indicates that this subtree was created
881 # by copying in this revision. Iff this is true, then it is valid to
882 # use expected_entries to prune items.
883 new_val[self.mutable_flag] = 2
884 in_pruneable_subtree = 1
885 else:
886 new_val[self.mutable_flag] = 1
887 if expected_entries is not None:
888 # If it is not None, then even if it is an empty list/tuple,
889 # we need to approve this item in its parent's approved entries list.
890 approved_entries = parent.get(self.approved_entries) or {}
891 approved_entries[last_component] = 1
892 parent[self.approved_entries] = approved_entries
893 if expected_entries:
894 approved_entries = new_val.get(self.approved_entries) or { }
895 new_approved_entries = { }
896 for ent in new_val.keys():
897 if (ent[0] != '/'):
898 if (not expected_entries.has_key(ent)
899 and not approved_entries.has_key(ent)):
900 if in_pruneable_subtree:
901 del new_val[ent]
902 deletions.append(ent)
903 else:
904 new_approved_entries[ent] = 1
905 new_val[self.approved_entries] = new_approved_entries
906 parent[last_component] = leaf_key
907 self.nodes_db[parent_key] = parent
908 self.symroots_db[path] = (tags, branches)
909 self.nodes_db[leaf_key] = new_val
911 return Change(op, old_names[0], old_names[1], deletions, copyfrom_rev)
  def delete_path(self, path, tags, branches, prune=None):
    """Delete PATH from the tree.  PATH may not have a leading slash.

    Return a tuple (path_deleted, closed_tags, closed_branches), where
    path_deleted is the path actually deleted or None if PATH did not
    exist, and closed_tags and closed_branches are lists of symbolic
    names closed off by this deletion -- that is, tags or branches
    which could be rooted in the previous revision of PATH, but not in
    this revision, because this rev changes PATH.  If path_deleted is
    None, then closed_tags and closed_branches will both be empty.

    TAGS are any tags that sprout from this revision of PATH, BRANCHES
    are any branches that sprout from this revision of PATH.  (I can't
    imagine that there are any of either, what to do if there are?)

    If PRUNE is not None, then delete the highest possible directory,
    which means the returned path may differ from PATH.  In other
    words, if PATH was the last entry in its parent, then delete
    PATH's parent, unless it too is the last entry in *its* parent, in
    which case delete that parent, and so on up the chain, until a
    directory is encountered that has an entry which is not a member
    of the parent stack of the original target.

    NOTE: This function does *not* allow you delete top-level entries
    (like /trunk, /branches, /tags), nor does it prune upwards beyond
    those entries.

    PRUNE is like the -P option to 'cvs checkout'."""

    components = string.split(path, '/')
    path_so_far = None

    # Start the walk from the root node of the youngest revision.
    parent_key = self.revs_db[str(self.youngest)]
    parent = self.nodes_db[parent_key]

    # As we walk down to find the dest, we remember each parent
    # directory's name and db key, in reverse order: push each new key
    # onto the front of the list, so that by the time we reach the
    # destination node, the zeroth item in the list is the parent of
    # that destination.
    #
    # Then if we actually do the deletion, we walk the list from left
    # to right, replacing as appropriate.
    #
    # The root directory has name None.
    parent_chain = [ ]
    parent_chain.insert(0, (None, parent_key))

    def is_prunable(dir):
      """Return true if DIR, a dictionary representing a directory,
      has just zero or one non-special entry, else return false.
      (In a pure world, we'd just ask len(DIR) > 1; it's only
      because the directory might have mutable flags and other special
      entries that we need this function at all.)"""
      num_items = len(dir)
      if num_items > 3:
        return None
      if num_items == 3 or num_items == 2:
        # Count only real entries; special entries start with '/'.
        real_entries = 0
        for key in dir.keys():
          if not key[0] == '/': real_entries = real_entries + 1
        if real_entries > 1:
          return None
        else:
          return 1
      else:
        return 1

    # We never prune our top-level directories (/trunk, /tags, /branches)
    if len(components) < 2:
      return None, [], []

    # Walk down to the target's parent, dropping breadcrumbs into
    # parent_chain as we go.
    for component in components[:-1]:
      if path_so_far:
        path_so_far = path_so_far + '/' + component
      else:
        path_so_far = component

      # If we can't reach the dest, then we don't need to do anything.
      if not parent.has_key(component):
        return None, [], []

      # Otherwise continue downward, dropping breadcrumbs.
      this_entry_key = parent[component]
      this_entry_val = self.nodes_db[this_entry_key]
      parent_key = this_entry_key
      parent = this_entry_val
      parent_chain.insert(0, (component, parent_key))

    # If the target is not present in its parent, then we're done.
    last_component = components[-1]
    old_names = [], []
    if not parent.has_key(last_component):
      return None, [], []
    elif self.symroots_db.has_key(path):
      # Deleting the path closes off any symbolic names recorded on it.
      old_names = self.symroots_db[path]
      del self.symroots_db[path]

    # The target is present, so remove it and bubble up, making a new
    # mutable path and/or pruning as necessary.
    pruned_count = 0
    prev_entry_name = last_component
    new_key = None
    for parent_item in parent_chain:
      pkey = parent_item[1]
      pval = self.nodes_db[pkey]

      # If we're pruning at all, and we're looking at a prunable thing
      # (and that thing isn't one of our top-level directories --
      # trunk, tags, branches) ...
      if prune and (new_key is None) and is_prunable(pval) \
         and parent_item != parent_chain[-2]:
        # ... then up our count of pruned items, and do nothing more.
        # All the action takes place when we hit a non-prunable
        # parent.
        pruned_count = pruned_count + 1
      else:
        # Else, we've hit a non-prunable, or aren't pruning, so bubble
        # up the new gospel.
        pval[self.mutable_flag] = 1
        if new_key is None:
          # First non-pruned ancestor: actually remove the child entry.
          del pval[prev_entry_name]
        else:
          # Higher ancestors just point at the freshly rewritten child.
          pval[prev_entry_name] = new_key
        new_key = gen_key()

      # Remember the name of the entry we occupy in the next parent up.
      prev_entry_name = parent_item[0]
      if new_key:
        self.nodes_db[new_key] = pval

    if new_key is None:
      # Everything in the chain was pruned away; install an empty root.
      new_key = gen_key()
      self.nodes_db[new_key] = self.empty_mutable_thang

    # Install the new root entry.
    self.revs_db[str(self.youngest)] = new_key

    # Sanity check -- this should be a "can't happen".
    if pruned_count > len(components):
      sys.stderr.write("%s: deleting '%s' tried to prune %d components.\n"
                       % (error_prefix, path, pruned_count))
      sys.exit(1)

    if pruned_count:
      if pruned_count == len(components):
        # We never prune away the root directory, so back up one component.
        pruned_count = pruned_count - 1
      # Report the highest directory actually deleted.
      retpath = string.join(components[:0 - pruned_count], '/')
    else:
      retpath = path

    return retpath, old_names[0], old_names[1]

    ### We've no place to put tags + branches.  Suspect we just
    ### shouldn't be taking them as arguments, which the doc string
    ### implies already.  Ponder.
1070 def close(self):
1071 # Just stabilize the last revision. This may or may not affect
1072 # anything, but if we end up using the mirror for anything after
1073 # this, it's nice to know the '/mutable' entries are gone.
1074 self.stabilize_youngest()
if sys.platform == "win32":
  def escape_shell_arg(arg):
    """Return ARG quoted for the Windows command shell: wrap it in
    double quotes and escape each embedded double quote."""
    # Use the str method rather than the deprecated string-module
    # function, and avoid shadowing the builtin name 'str'.
    return '"' + arg.replace('"', '"^""') + '"'
else:
  def escape_shell_arg(arg):
    """Return ARG quoted for a Bourne-style shell: wrap it in single
    quotes, closing and reopening the quoting around each embedded
    single quote."""
    return "'" + arg.replace("'", "'\\''") + "'"
class Dumper:
  """Emit Subversion dumpfile data for each converted revision, either
  into a single dumpfile (--dump-only) or piped incrementally into an
  'svnadmin load' subprocess.  A RepositoryMirror tracks the state of
  the repository head so adds can be distinguished from changes."""

  def __init__(self, ctx):
    'Open DUMPFILE_PATH, and initialize revision to REVISION.'
    self.dumpfile_path = ctx.dumpfile
    self.revision = 0
    self.repos_mirror = RepositoryMirror()
    self.svnadmin = ctx.svnadmin
    self.target = ctx.target
    self.dump_only = ctx.dump_only
    self.dumpfile = None
    self.path_encoding = ctx.encoding
    self.loader_pipe = None

    # If all we're doing here is dumping, we can go ahead and
    # initialize our single dumpfile.  Else, if we're supposed to
    # create the repository, do so.
    if self.dump_only:
      self.init_dumpfile()
      self.write_dumpfile_header(self.dumpfile)
    else:
      if not ctx.existing_svnrepos:
        print "creating repos '%s'" % (self.target)
        run_command('%s create %s %s' % (self.svnadmin, ctx.bdb_txn_nosync
                                         and "--bdb-txn-nosync" or "",
                                         self.target))
      self.loader_pipe = os.popen('%s load -q %s' %
                                  (self.svnadmin, self.target),
                                  PIPE_WRITE_MODE)
      self.write_dumpfile_header(self.loader_pipe)

  def init_dumpfile(self):
    # Open the dumpfile for binary-mode write.
    self.dumpfile = open(self.dumpfile_path, 'wb')

  def write_dumpfile_header(self, fileobj):
    # Initialize the dumpfile with the standard headers:
    #
    # The CVS repository doesn't have a UUID, and the Subversion
    # repository will be created with one anyway.  So when we load
    # the dumpfile, we don't specify a UUID.
    fileobj.write('SVN-fs-dump-format-version: 2\n\n')

  def flush_and_remove_dumpfile(self):
    # Feed the accumulated dumpfile data into the 'svnadmin load'
    # pipe, then remove the dumpfile so a fresh one can be started.
    if self.dumpfile is None:
      return
    self.dumpfile.close()
    print "piping revision %d into '%s' loader" % (self.revision, self.target)
    dumpfile = open(self.dumpfile_path, 'rb')
    while 1:
      data = dumpfile.read(1024*1024) # Choice of 1MB chunks is arbitrary
      if not len(data): break
      self.loader_pipe.write(data)
    dumpfile.close()

    os.remove(self.dumpfile_path)

  def start_revision(self, props):
    """Write the next revision, with properties, to the dumpfile.
    Return the newly started revision."""

    # If this is not a --dump-only, we need to flush (load into the
    # repository) any dumpfile data we have already written and then
    # init a new dumpfile before starting this revision.
    if not self.dump_only:
      if self.revision > 0:
        self.flush_and_remove_dumpfile()
      self.init_dumpfile()
    self.revision = self.revision + 1

    # A revision typically looks like this:
    #
    #   Revision-number: 1
    #   Prop-content-length: 129
    #   Content-length: 129
    #
    #   K 7
    #   svn:log
    #   V 27
    #   Log message for revision 1.
    #   K 10
    #   svn:author
    #   V 7
    #   jrandom
    #   K 8
    #   svn:date
    #   V 27
    #   2003-04-22T22:57:58.132837Z
    #   PROPS-END
    #
    # Notice that the length headers count everything -- not just the
    # length of the data but also the lengths of the lengths, including
    # the 'K ' or 'V ' prefixes.
    #
    # The reason there are both Prop-content-length and Content-length
    # is that the former includes just props, while the latter includes
    # everything.  That's the generic header form for any entity in a
    # dumpfile.  But since revisions only have props, the two lengths
    # are always the same for revisions.

    # Calculate the total length of the props section.
    total_len = 10  # len('PROPS-END\n')
    for propname in props.keys():
      klen = len(propname)
      klen_len = len('K %d' % klen)
      vlen = len(props[propname])
      vlen_len = len('V %d' % vlen)
      # + 4 for the four newlines within a given property's section
      total_len = total_len + klen + klen_len + vlen + vlen_len + 4

    # Print the revision header and props
    self.dumpfile.write('Revision-number: %d\n'
                        'Prop-content-length: %d\n'
                        'Content-length: %d\n'
                        '\n'
                        % (self.revision, total_len, total_len))

    for propname in props.keys():
      self.dumpfile.write('K %d\n'
                          '%s\n'
                          'V %d\n'
                          '%s\n' % (len(propname),
                                    propname,
                                    len(props[propname]),
                                    props[propname]))

    self.dumpfile.write('PROPS-END\n')
    self.dumpfile.write('\n')

    # Keep the head mirror in step with the dumpfile.
    self.repos_mirror.new_revision()
    return self.revision

  def add_dir(self, path):
    # Emit a directory-add record; used as the intermediate-directory
    # callback passed to the repository mirror.
    self.dumpfile.write("Node-path: %s\n"
                        "Node-kind: dir\n"
                        "Node-action: add\n"
                        "Prop-content-length: 10\n"
                        "Content-length: 10\n"
                        "\n"
                        "PROPS-END\n"
                        "\n"
                        "\n" % self.utf8_path(path))

  def utf8_path(self, path):
    """Return UTF-8 encoded 'path' based on ctx.path_encoding."""
    try:
      ### Log messages can be converted with 'replace' strategy.
      ### We can't afford that here.
      unicode_path = unicode(path, self.path_encoding, 'strict')
      return unicode_path.encode('utf-8')
    except UnicodeError:
      print "Unable to convert a path '%s' to internal encoding." % path
      print "Consider rerunning with (for example) '--encoding=latin1'"
      sys.exit(1)

  def probe_path(self, path):
    """Return true if PATH exists in the youngest tree of the svn
    repository, else return None.  PATH does not start with '/'."""
    if self.repos_mirror.probe_path(path) is None:
      return None
    else:
      return 1

  def copy_path(self, svn_src_path, svn_src_rev, svn_dst_path, entries=None):
    """If it wouldn't be redundant to do so, emit a copy of SVN_SRC_PATH at
    SVN_SRC_REV to SVN_DST_PATH.

    Return 1 if the copy was done, None otherwise.

    If ENTRIES is not None, it is a dictionary whose keys are the full
    set of entries the new copy is expected to have -- and therefore
    any entries in the new dst but not in ENTRIES will be removed.
    (Keys in ENTRIES beginning with '/' are ignored.)

    No action is taken for keys in ENTRIES but not in the dst; it is
    assumed that the caller will compensate for these by calling
    copy_path again with other arguments."""
    change = self.repos_mirror.change_path(svn_dst_path,
                                           [], [],
                                           self.add_dir,
                                           svn_src_path, svn_src_rev,
                                           entries)
    if change.op == OP_ADD:
      # A copy source from this revision or later would be invalid in
      # the dumpfile -- fail loudly rather than emit a broken dump.
      if change.copyfrom_rev >= self.revision:
        sys.stderr.write("%s: invalid copyfrom revision %d used while\n"
                         "creating revision %d in dumpfile.\n"
                         % (error_prefix, change.copyfrom_rev, self.revision))
        sys.exit(1)

      # We don't need to include "Node-kind:" for copies; the loader
      # ignores it anyway and just uses the source kind instead.
      self.dumpfile.write('Node-path: %s\n'
                          'Node-action: add\n'
                          'Node-copyfrom-rev: %d\n'
                          'Node-copyfrom-path: /%s\n'
                          '\n'
                          % (self.utf8_path(svn_dst_path),
                             change.copyfrom_rev,
                             self.utf8_path(svn_src_path)))

      # Remove any entries the mirror pruned from the copied tree.
      for ent in change.deleted_entries:
        self.dumpfile.write('Node-path: %s\n'
                            'Node-action: delete\n'
                            '\n' % (self.utf8_path(svn_dst_path + '/' + ent)))
      return 1
    return None

  def prune_entries(self, path, expected):
    """Delete any entries in PATH that are not in list EXPECTED.
    PATH need not be a directory, but of course nothing will happen if
    it's a file.  Entries beginning with '/' are ignored as usual."""
    change = self.repos_mirror.change_path(path,
                                           [], [],
                                           self.add_dir,
                                           None, None,
                                           expected, 1)
    for ent in change.deleted_entries:
      self.dumpfile.write('Node-path: %s\n'
                          'Node-action: delete\n'
                          '\n' % (self.utf8_path(path + '/' + ent)))

  def add_or_change_path(self, cvs_path, svn_path, cvs_rev, rcs_file,
                         tags, branches, ctx):
    # Emit a file add/change record for SVN_PATH, with contents taken
    # from CVS_REV of RCS_FILE via 'co'.  Returns the (closed_tags,
    # closed_branches) reported by the repository mirror.

    # figure out the real file path for "co"
    try:
      f_st = os.stat(rcs_file)
    except os.error:
      # Dead files live in the Attic subdirectory.
      dirname, fname = os.path.split(rcs_file)
      rcs_file = os.path.join(dirname, 'Attic', fname)
      f_st = os.stat(rcs_file)

    # We begin with only a "CVS revision" property.
    if ctx.cvs_revnums:
      prop_contents = 'K 15\ncvs2svn:cvs-rev\nV %d\n%s\n' \
                      % (len(cvs_rev), cvs_rev)
    else:
      prop_contents = ''

    # Check for executable-ness.
    if f_st[0] & stat.S_IXUSR:
      prop_contents = prop_contents + 'K 14\nsvn:executable\nV 1\n*\n'

    # Set MIME type, and maybe eol-style for text files.
    if ctx.mime_mapper:
      mime_type = ctx.mime_mapper.get_type_from_filename(cvs_path)
      if mime_type:
        prop_contents = prop_contents + ('K 13\nsvn:mime-type\nV %d\n%s\n' % \
                                         (len(mime_type), mime_type))
        if ctx.set_eol_style and mime_type.startswith("text/"):
          prop_contents = prop_contents + 'K 13\nsvn:eol-style\nV 6\nnative\n'

    # Calculate the property length (+10 for "PROPS-END\n")
    props_len = len(prop_contents) + 10

    ### FIXME: We ought to notice the -kb flag set on the RCS file and
    ### use it to set svn:mime-type.
    ### (How this will interact with the mime-mapper code
    ### has yet to be decided.)

    basename = os.path.basename(rcs_file[:-2])
    pipe_cmd = 'co -q -x,v -p%s %s' % (cvs_rev, escape_shell_arg(rcs_file))
    pipe = os.popen(pipe_cmd, PIPE_READ_MODE)

    # You might think we could just test
    #
    #   if cvs_rev[-2:] == '.1':
    #
    # to determine if this path exists in head yet.  But that wouldn't
    # be perfectly reliable, both because of 'cvs commit -r', and also
    # the possibility of file resurrection.
    change = self.repos_mirror.change_path(svn_path, tags, branches,
                                           self.add_dir)

    if change.op == OP_ADD:
      action = 'add'
    else:
      action = 'change'

    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        'Prop-content-length: %d\n'
                        'Text-content-length: '
                        % (self.utf8_path(svn_path), action, props_len))

    # Remember where the placeholder length/checksum fields start, so
    # they can be patched with real values after the contents pass.
    pos = self.dumpfile.tell()

    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    self.dumpfile.write(prop_contents + 'PROPS-END\n')

    # Insert the rev contents, calculating length and checksum as we go.
    checksum = md5.new()
    length = 0
    buf = pipe.read()
    while buf:
      checksum.update(buf)
      length = length + len(buf)
      self.dumpfile.write(buf)
      buf = pipe.read()
    if pipe.close() is not None:
      sys.exit('%s: Command failed: "%s"' % (error_prefix, pipe_cmd))

    # Go back to patch up the length and checksum headers:
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of property data, text data,
    # and any metadata around/inside them.
    self.dumpfile.write('%16d' % (length + props_len))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability.
    self.dumpfile.write('\n\n')
    return change.closed_tags, change.closed_branches

  def delete_path(self, svn_path, tags, branches, prune=None):
    """If SVN_PATH exists in the head mirror, output the deletion to
    the dumpfile, else output nothing to the dumpfile.

    Return a tuple (path_deleted, closed_tags, closed_branches), where
    path_deleted is the path deleted if any or None if no deletion was
    necessary, and closed_tags and closed_branches are lists of symbolic
    names closed off by this deletion -- that is, tags or branches
    which could be rooted in the previous revision of PATH, but not in
    this revision, because this rev changes PATH.  If path_deleted is
    None, then closed_tags and closed_branches will both be empty.

    Iff PRUNE is true, then the path deleted can be not None, yet
    shorter than SVN_PATH because of pruning."""
    deleted_path, closed_tags, closed_branches \
                  = self.repos_mirror.delete_path(svn_path, tags,
                                                  branches, prune)
    if deleted_path:
      print "    (deleted '%s')" % deleted_path
      self.dumpfile.write('Node-path: %s\n'
                          'Node-action: delete\n'
                          '\n' % self.utf8_path(deleted_path))
    return deleted_path, closed_tags, closed_branches

  def close(self):
    self.repos_mirror.close()

    # If we're only making a dumpfile, we should be done now.  Just
    # close the dumpfile.  Otherwise, we're in "incremental" mode, and
    # we need to close our incremental dumpfile, flush it to the
    # repository, and then remove it.
    if self.dump_only:
      self.dumpfile.close()
    else:
      self.flush_and_remove_dumpfile()
      ret = self.loader_pipe.close()
      if ret:
        sys.stderr.write('%s: svnadmin load exited with error code %s' %
                         (error_prefix, ret))
        sys.exit(1)
def format_date(date):
  """Return an svn-compatible UTC timestamp for DATE (seconds since epoch)."""
  # Subversion dates look like "2002-09-29T14:44:59.000000Z".  CVS
  # timestamps have whole-second resolution, so the fractional part
  # is always zero.
  svn_date_format = "%Y-%m-%dT%H:%M:%S.000000Z"
  return time.strftime(svn_date_format, time.gmtime(date))
def make_revision_props(ctx, symbolic_name, is_tag, date=None):
  """Return a dictionary of revision properties for the manufactured
  commit that finished SYMBOLIC_NAME.  If IS_TAG is true, write the
  log message as though for a tag, else as though for a branch.
  If DATE is passed, use it as the value of the svn:date property."""
  if is_tag:
    kind = 'tag'
  else:
    kind = 'branch'

  # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
  # Long names get pushed onto their own line to keep the log short.
  if len(symbolic_name) >= 13:
    separator = '\n'
  else:
    separator = ' '

  log = ("This commit was manufactured by cvs2svn to create %s%s'%s'."
         % (kind, separator, symbolic_name))

  return { 'svn:author' : ctx.username,
           'svn:log'    : log,
           'svn:date'   : date or format_date(time.time())}
1486 class SymbolicNameTracker:
1487 """Track the Subversion path/revision ranges of CVS symbolic names.
1488 This is done in a .db file, representing a tree in the usual way.
1489 In addition to directory entries, each object in the database stores
1490 the earliest revision from which it could be copied, and the first
1491 revision from which it could no longer be copied. Intermediate
1492 directories go one step farther: they record counts for the various
1493 revisions from which items under them could have been copied, and
1494 counts for the cutoff revisions. For example:
1496 .----------.
1497 | sub1 | [(2, 1), (3, 3)]
1498 | / | [(5, 1), (17, 2), (50, 1)]
1499 | / |
1500 |/ sub2 |
1501 / \ |
1502 /|_____\____|
1503 / \
1504 ______/ \_________
1505 / \
1506 / \
1507 / \
1508 .---------. .---------.
1509 | file1 | | file3 |
1510 | / | [(3, 2)] | \ | [(2, 1), (3, 1)]
1511 | / | [(17, 1), (50, 1)] | \ | [(5, 1), (10, 1)]
1512 | / | | \ |
1513 |/ file2 | | file4 \|
1514 / \ | | / \
1515 /|_____\___| |___/_____|\
1516 / \ / \
1517 / \ / \
1518 / \ / \
1519 / + / +
1520 +======+ | +======+ |
1521 | | [(3, 1)] | | | [(2, 1)] |
1522 | | [(17, 1)] | | | [(5, 1)] |
1523 | | | | | |
1524 +======+ | +======+ |
1525 +======+ +======+
1526 | | [(3, 1)] | | [(3, 1)]
1527 | | [(50, 1)] | | [(17, 1)]
1528 | | | |
1529 +======+ +======+
1531 The two lists to the right of each node represent the 'opening' and
1532 'closing' revisions respectively. Each tuple in a list is of the
1533 form (REV, COUNT). For leaf nodes, COUNT is always 1, of course.
1534 For intermediate nodes, the counts are the sums of the corresponding
1535 counts of child nodes.
1537 These revision scores are used to determine the optimal copy
1538 revisions for each tree/subtree at branch or tag creation time.
1540 The svn path input will most often be a trunk path, because the
1541 path/rev information recorded here is about where and when the given
1542 symbolic name could be rooted, *not* a path/rev for which commits
1543 along that symbolic name take place (of course, commits only happen on
1544 branches anyway)."""
  def __init__(self):
    # Database file holding the symbolic-name tree.
    self.db_file = SYMBOLIC_NAMES_DB
    # 'n' mode: always start from a fresh, empty database.
    self.db = Database(self.db_file, 'n')
    self.root_key = gen_key()
    # The root node starts out with no entries.
    self.db[self.root_key] = {}

    # The keys for the opening and closing revision lists attached to
    # each directory or file.  Includes "/" so as never to conflict
    # with any real entry.
    self.tags_opening_revs_key = "/tag-openings"
    self.tags_closing_revs_key = "/tag-closings"
    self.br_opening_revs_key   = "/br-openings"
    self.br_closing_revs_key   = "/br-closings"

    # When a node is copied into the repository, the revision copied
    # is stored under the appropriate key, and the corresponding
    # opening and closing rev lists are removed.
    self.tags_copyfrom_rev_key = "/tags-copyfrom-rev"
    self.br_copyfrom_rev_key = "/br-copyfrom-rev"
  def probe_path(self, symbolic_name, path, debugging=None):
    """If 'SYMBOLIC_NAME/PATH' exists in the symbolic name tree,
    return the value of its last component, else return None.
    PATH may be None, but may not start with '/'.
    If DEBUGGING is true, then print trace output to stdout."""
    if path:
      components = [symbolic_name] + string.split(path, '/')
    else:
      components = [symbolic_name]

    if debugging:
      print "PROBING SYMBOLIC NAME:\n", components

    parent_key = self.root_key
    parent = self.db[parent_key]
    last_component = "/"
    # i tracks depth, used only to indent the debugging trace.
    i = 1
    for component in components:
      if debugging:
        print "  " * i,
        print "'%s' key: %s, val:" % (last_component, parent_key), parent

      # Check for a "can't happen."
      if not parent.has_key(component):
        sys.stderr.write("%s: sym probe failed: '%s' does not contain '%s'\n"
                         % (error_prefix, last_component, component))
        sys.exit(1)

      this_entry_key = parent[component]
      this_entry_val = self.db[this_entry_key]
      # Step down one level in the tree.
      parent_key = this_entry_key
      parent = this_entry_val
      last_component = component
      i = i + 1

    if debugging:
      print "  " * i,
      print "parent_key: %s, val:" % parent_key, parent

    # It's not actually a parent at this point, it's the leaf node.
    return parent
1608 def bump_rev_count(self, item_key, rev, revlist_key):
1609 """Increment REV's count in opening or closing list under KEY.
1610 REVLIST_KEY is self.*_opening_revs_key or self.*_closing_revs_key,
1611 and indicates which rev list to increment REV's count in.
1613 For example, if REV is 7, REVLIST_KEY is
1614 self.tags_opening_revs_key, and the entry's tags opening revs list
1615 looks like this
1617 [(2, 5), (7, 2), (10, 15)]
1619 then afterwards it would look like this:
1621 [(2, 5), (7, 3), (10, 15)]
1623 But if no tuple for revision 7 were present, then one would be
1624 added, for example
1626 [(2, 5), (10, 15)]
1628 would become
1630 [(2, 5), (7, 1), (10, 15)]
1632 The list is sorted by ascending revision both before and after."""
1634 entry_val = self.db[item_key]
1636 if not entry_val.has_key(revlist_key):
1637 entry_val[revlist_key] = [(rev, 1)]
1638 else:
1639 rev_counts = entry_val[revlist_key]
1640 for i in range(len(rev_counts)):
1641 this_rev, this_count = rev_counts[i]
1642 if rev == this_rev:
1643 rev_counts[i] = (this_rev, this_count + 1)
1644 break
1645 elif this_rev > rev:
1646 if i > 0:
1647 i = i - 1
1648 rev_counts.insert(i, (rev, 1))
1649 break
1650 else:
1651 rev_counts.append((rev, 1))
1652 entry_val[revlist_key] = rev_counts
1654 self.db[item_key] = entry_val
  # The verb form of "root" is "root", but that would be misleading in
  # this case; and the opposite of "uproot" is presumably "downroot",
  # but that wouldn't exactly clarify either.  Hence, "enroot" :-).
  def enroot_names(self, svn_path, svn_rev, names, opening_key):
    """Record SVN_PATH at SVN_REV as the earliest point from which the
    symbolic names in NAMES could be copied.  OPENING_KEY is
    self.tags_opening_revs_key or self.br_opening_revs_key, to
    indicate whether NAMES contains tag names or branch names.
    SVN_PATH does not start with '/'."""

    # Guard against names == None
    if not names:
      return

    for name in names:
      # Walk NAME/SVN_PATH from the root, bumping the opening-rev
      # count on every node along the way, creating missing
      # intermediate nodes as we go.
      components = [name] + string.split(svn_path, '/')
      parent_key = self.root_key
      for component in components:
        # Bump before re-reading the parent, since bump_rev_count
        # rewrites the node in the db.
        self.bump_rev_count(parent_key, svn_rev, opening_key)
        parent = self.db[parent_key]
        if not parent.has_key(component):
          new_child_key = gen_key()
          parent[component] = new_child_key
          self.db[new_child_key] = {}
          self.db[parent_key] = parent
        # One way or another, parent now has an entry for component.
        this_entry_key = parent[component]
        this_entry_val = self.db[this_entry_key]
        # Swaparoo.
        parent_key = this_entry_key
        parent = this_entry_val

      # Finally, bump the count on the leaf node itself.
      self.bump_rev_count(parent_key, svn_rev, opening_key)
1690 def enroot_tags(self, svn_path, svn_rev, tags):
1691 """Record SVN_PATH at SVN_REV as the earliest point from which the
1692 symbolic names in TAGS could be copied. SVN_PATH does not start
1693 with '/'."""
1694 self.enroot_names(svn_path, svn_rev, tags, self.tags_opening_revs_key)
1696 def enroot_branches(self, svn_path, svn_rev, branches):
1697 """Record SVN_PATH at SVN_REV as the earliest point from which the
1698 symbolic names in BRANCHES could be copied. SVN_PATH does not
1699 start with '/'."""
1700 self.enroot_names(svn_path, svn_rev, branches, self.br_opening_revs_key)
  def close_names(self, svn_path, svn_rev, names, closing_key):
    """Record that as of SVN_REV, SVN_PATH could no longer be the
    source from which any of symbolic names in NAMES could be copied.
    CLOSING_KEY is self.tags_closing_revs_key or
    self.br_closing_revs_key, to indicate whether NAMES are tags or
    branches.  SVN_PATH does not start with '/'."""

    # Guard against names == None
    if not names:
      return

    for name in names:
      # Walk NAME/SVN_PATH from the root, bumping the closing-rev
      # count on every node along the way.  Unlike enroot_names(),
      # the whole path is expected to exist already.
      components = [name] + string.split(svn_path, '/')
      parent_key = self.root_key
      for component in components:
        # Bump before re-reading the parent, since bump_rev_count
        # rewrites the node in the db.
        self.bump_rev_count(parent_key, svn_rev, closing_key)
        parent = self.db[parent_key]
        # Check for a "can't happen".
        if not parent.has_key(component):
          sys.stderr.write("%s: in path '%s', value for parent key '%s' "
                           "does not have entry '%s'\n"
                           % (error_prefix, svn_path, parent_key, component))
          sys.exit(1)
        this_entry_key = parent[component]
        this_entry_val = self.db[this_entry_key]
        # Swaparoo.
        parent_key = this_entry_key
        parent = this_entry_val

      # Finally, bump the count on the leaf node itself.
      self.bump_rev_count(parent_key, svn_rev, closing_key)
  def close_tags(self, svn_path, svn_rev, tags):
    """Record that as of SVN_REV, SVN_PATH could no longer be the
    source from which any of TAGS could be copied.  SVN_PATH does not
    start with '/'."""
    # Thin wrapper around close_names(), selecting the tag-flavored
    # closing-revision bookkeeping key.
    self.close_names(svn_path, svn_rev, tags, self.tags_closing_revs_key)
  def close_branches(self, svn_path, svn_rev, branches):
    """Record that as of SVN_REV, SVN_PATH could no longer be the
    source from which any of BRANCHES could be copied.  SVN_PATH does
    not start with '/'."""
    # Thin wrapper around close_names(), selecting the branch-flavored
    # closing-revision bookkeeping key.
    self.close_names(svn_path, svn_rev, branches, self.br_closing_revs_key)
1745 def score_revisions(self, openings, closings):
1746 """Return a list of revisions and scores based on OPENINGS and
1747 CLOSINGS. The returned list looks like:
1749 [(REV1 SCORE1), (REV2 SCORE2), ...]
1751 where REV2 > REV1. OPENINGS and CLOSINGS are the values of
1752 self.tags_opening_revs_key and self.tags_closing_revs_key, or
1753 self.br_opening_revs_key and self.br_closing_revs_key, from some file or
1754 directory node, or else None.
1756 Each score indicates that copying the corresponding revision (or any
1757 following revision up to the next revision in the list) of
1758 the object in question would yield that many correct paths at or
1759 underneath the object. There may be other paths underneath it
1760 which are not correct and need to be deleted or recopied; those
1761 can only be detected by descending and examining their scores.
1763 If OPENINGS is false, return the empty list."""
1765 # First look for easy outs.
1766 if not openings:
1767 return []
1769 # Must be able to call len(closings) below.
1770 if closings is None:
1771 closings = []
1773 # No easy out, so wish for lexical closures and calculate the scores :-).
1774 scores = []
1775 opening_score_accum = 0
1776 for i in range(len(openings)):
1777 opening_rev, opening_score = openings[i]
1778 opening_score_accum = opening_score_accum + opening_score
1779 scores.append((opening_rev, opening_score_accum))
1780 min = 0
1781 for i in range(len(closings)):
1782 closing_rev, closing_score = closings[i]
1783 done_exact_rev = None
1784 insert_index = None
1785 insert_score = None
1786 for j in range(min, len(scores)):
1787 score_rev, score = scores[j]
1788 if score_rev >= closing_rev:
1789 if not done_exact_rev:
1790 if score_rev > closing_rev:
1791 insert_index = j
1792 insert_score = scores[j-1][1] - closing_score
1793 done_exact_rev = 1
1794 scores[j] = (score_rev, score - closing_score)
1795 else:
1796 min = j + 1
1797 if not done_exact_rev:
1798 scores.append((closing_rev,scores[-1][1] - closing_score))
1799 if insert_index is not None:
1800 scores.insert(insert_index, (closing_rev, insert_score))
1801 return scores
1803 def best_rev(self, scores, prefer_rev, limit_rev):
1804 """Return the revision older than LIMIT_REV with the highest score
1805 from SCORES, a list returned by score_revisions(). When the maximum score
1806 is shared by multiple revisions, the oldest revision is selected, unless
1807 PREFER_REV is one of the possibilities, in which case, it is selected."""
1808 max_score = 0
1809 prefer_rev_score = -1
1810 rev = SVN_INVALID_REVNUM
1811 for pair in scores:
1812 if pair[1] > max_score and pair[0] < limit_rev:
1813 max_score = pair[1]
1814 rev = pair[0]
1815 if pair[0] <= prefer_rev:
1816 prefer_rev_score = pair[1]
1817 if prefer_rev_score == max_score:
1818 rev = prefer_rev
1819 return rev
  def is_best_rev(self, scores, rev, limit_rev):
    """Return true if REV has the highest score for revisions older than
    LIMIT_REV from SCORES, a list returned by score_revisions()."""
    # Passing REV as the preferred revision means REV wins any tie for
    # the maximum score, so this test is not overly strict.
    return self.best_rev(scores, rev, limit_rev) == rev
1826 # Helper for copy_descend().
1827 def cleanup_entries(self, rev, limit_rev, entries, is_tag):
1828 """Return a copy of ENTRIES, minus the individual entries whose
1829 highest scoring revision doesn't match REV (and also, minus and
1830 special '/'-denoted flags). IS_TAG is 1 or None, based on whether
1831 this work is being done for the sake of a tag or a branch."""
1832 if is_tag:
1833 opening_key = self.tags_opening_revs_key
1834 closing_key = self.tags_closing_revs_key
1835 else:
1836 opening_key = self.br_opening_revs_key
1837 closing_key = self.br_closing_revs_key
1839 new_entries = {}
1840 for key in entries.keys():
1841 if key[0] == '/': # Skip flags
1842 continue
1843 entry = entries.get(key)
1844 val = self.db[entry]
1845 scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1846 if self.is_best_rev(scores, rev, limit_rev):
1847 new_entries[key] = entry
1848 return new_entries
  # Helper for fill_name().
  def copy_descend(self, dumper, ctx, name, parent, entry_name,
                   parent_rev, src_path, dst_path, is_tag, jit_new_rev=None):
    """Starting with ENTRY_NAME in directory object PARENT at
    PARENT_REV, use DUMPER and CTX to copy nodes in the Subversion
    repository, manufacturing the source paths with SRC_PATH and the
    destination paths with NAME and DST_PATH.

    If IS_TAG is true, NAME is treated as a tag, else as a branch.

    If JIT_NEW_REV is not None, it is a list of one or two elements.
    If the first element is true, then if any copies are to be made,
    invoke DUMPER.start_revision() before the first copy, then set
    JIT_NEW_REV[0] to None, so no more new revisions are made for this
    symbolic name anywhere in this descent.

    The second element, if present, is the string to be used for the svn:date
    property of any JIT-created revision.

    ('JIT' == 'Just In Time'.)"""
    ### Hmmm, is passing [1] instead of 1 an idiomatic way of passing
    ### a side-effectable boolean in Python?  That's how the
    ### JIT_NEW_REV parameter works here and elsewhere, but maybe
    ### there's a clearer way to do it?

    key = parent[entry_name]
    val = self.db[key]

    # Select tag- or branch-flavored bookkeeping keys.
    if is_tag:
      opening_key = self.tags_opening_revs_key
      closing_key = self.tags_closing_revs_key
      copyfrom_rev_key = self.tags_copyfrom_rev_key
    else:
      opening_key = self.br_opening_revs_key
      closing_key = self.br_closing_revs_key
      copyfrom_rev_key = self.br_copyfrom_rev_key

    limit_rev = dumper.revision
    if jit_new_rev and jit_new_rev[0]:
      # Because in this case the current rev is complete,
      # so is a valid copyfrom source
      limit_rev = limit_rev + 1

    # The presence of copyfrom_rev_key marks a node as already copied.
    if not val.has_key(copyfrom_rev_key):
      # If not already copied this subdir, calculate its "best rev"
      # and see if it differs from parent's best rev.
      scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
      rev = self.best_rev(scores, parent_rev, limit_rev)

      if rev == SVN_INVALID_REVNUM:
        return  # name is a branch, but we're doing a tag, or vice versa

      else:
        if is_tag:
          copy_dst = make_path(ctx, dst_path, None, name)
        else:
          copy_dst = make_path(ctx, dst_path, name, None)

        expected_entries = self.cleanup_entries(rev, limit_rev,
                                                val, is_tag)
        if (rev != parent_rev):
          # This node's best source differs from the parent's, so it
          # needs its own copy (possibly in a JIT-created revision).
          if jit_new_rev and jit_new_rev[0]:
            dumper.start_revision(make_revision_props(ctx, name, is_tag,
                len(jit_new_rev) > 1 and jit_new_rev[1] or None))
            jit_new_rev[0] = None
          if dumper.copy_path(src_path, rev, copy_dst, expected_entries):
            parent_rev = rev
          else:
            # If we didn't copy, then we need to prune
            dumper.prune_entries(copy_dst, expected_entries)
        else:
          # Even if we kept the already-present revision of this entry
          # instead of copying a new one, we still need to prune out
          # anything that's not part of the symbolic name.
          dumper.prune_entries(copy_dst, expected_entries)

        # Record that this copy is done: mark the node with its
        # copyfrom revision and drop the now-consumed open/close data.
        val[copyfrom_rev_key] = parent_rev
        if val.has_key(opening_key):
          del val[opening_key]
        if val.has_key(closing_key):
          del val[closing_key]
        self.db[key] = val

    # Recurse into each real child entry ('/'-prefixed keys are flags).
    for ent in val.keys():
      if not ent[0] == '/':
        if src_path:
          next_src = src_path + '/' + ent
        else:
          next_src = ent
        if dst_path:
          next_dst = dst_path + '/' + ent
        else:
          next_dst = ent
        self.copy_descend(dumper, ctx, name, val, ent, parent_rev,
                          next_src, next_dst, is_tag, jit_new_rev)
  def fill_name(self, dumper, ctx, name, is_tag, jit_new_rev=None):
    """Use DUMPER to create all currently available parts of symbolic
    name NAME that have not been created already.

    If IS_TAG is true, NAME is treated as a tag, else as a branch.

    JIT_NEW_REV is as documented for the copy_descend() function."""

    # A source path looks like this in the symbolic name tree:
    #
    #    thisbranch/trunk/proj/foo/bar/baz.c
    #
    # ...or occasionally...
    #
    #    thisbranch/branches/sourcebranch/proj/foo/bar/baz.c
    #
    # (the latter when 'thisbranch' is branched off 'sourcebranch').
    #
    # Meanwhile, we're copying to a location in the repository like
    #
    #    /branches/thisbranch/proj/foo/bar/baz.c  or
    #    /tags/tagname/proj/foo/bar/baz.c
    #
    # Of course all this depends on make_path()'s behavior.  At
    # various times we've changed the way it produces paths (see
    # revisions 6028 and 6347).  If it changes again, the logic here
    # must be adjusted to match.

    parent_key = self.root_key
    parent = self.db[parent_key]

    # If there are no origin records, then we must've messed up earlier.
    if not parent.has_key(name):
      if is_tag:
        sys.stderr.write("%s: no origin records for tag '%s'.\n"
                         % (error_prefix, name))
      else:
        sys.stderr.write("%s: no origin records for branch '%s'.\n"
                         % (error_prefix, name))
      sys.exit(1)

    # Descend into NAME's own subtree; everything below it is keyed by
    # source path components.
    parent_key = parent[name]
    parent = self.db[parent_key]

    # All Subversion source paths under the branch start with one of
    # three things:
    #
    #   /trunk/...
    #   /branches/foo/...
    #   /tags/foo/...
    #
    # (We don't care what foo is, it's just a component to skip over.)
    #
    # Since these don't all have the same number of components, we
    # manually descend into each as far as necessary, then invoke
    # copy_descend() once we're in the right place in both trees.
    #
    # Since it's possible for a branch or tag to have some source
    # paths on trunk and some on branches, there's some question about
    # what to copy as the top-level directory of the branch.  Our
    # solution is to [somewhat randomly] give preference to trunk.
    # Note that none of these paths can ever conflict; for example,
    # it would be impossible to have both
    #
    #   thisbranch/trunk/myproj/lib/drivers.c   and
    #   thisbranch/branches/sourcebranch/myproj/lib/drivers.c
    #
    # because that would imply that the symbolic name 'thisbranch'
    # appeared twice in the RCS file header, referring to two
    # different revisions.  Well, I suppose that's *possible*, but its
    # effect is undefined, and it's as reasonable for us to just
    # overwrite one with the other as anything else -- anyway, isn't
    # that what CVS would do if you checked out the branch?  <shrug>

    if parent.has_key(ctx.trunk_base):
      self.copy_descend(dumper, ctx, name, parent, ctx.trunk_base,
                        SVN_INVALID_REVNUM, ctx.trunk_base, "",
                        is_tag, jit_new_rev)
    if parent.has_key(ctx.branches_base):
      branch_base_key = parent[ctx.branches_base]
      branch_base = self.db[branch_base_key]
      for this_source in branch_base.keys():
        # We skip special names beginning with '/' for the usual
        # reason.  We skip cases where (this_source == name) for a
        # different reason: if a CVS branch were rooted in itself,
        # that would imply that the same symbolic name appeared on two
        # different branches in an RCS file, which CVS doesn't
        # permit.  So while it wouldn't hurt to descend, it would be a
        # waste of time.
        if (this_source[0] != '/') and (this_source != name):
          src_path = ctx.branches_base + '/' + this_source
          self.copy_descend(dumper, ctx, name, branch_base, this_source,
                            SVN_INVALID_REVNUM, src_path, "",
                            is_tag, jit_new_rev)
  def fill_tag(self, dumper, ctx, tag, jit_new_rev=None):
    """Use DUMPER to create all currently available parts of TAG that
    have not been created already.  Use CTX.trunk_base, CTX.tags_base,
    and CTX.branches_base to determine the source and destination
    paths in the Subversion repository.

    JIT_NEW_REV is as documented for the copy_descend() function."""
    # Delegate to fill_name() with is_tag=1.
    self.fill_name(dumper, ctx, tag, 1, jit_new_rev)
  def fill_branch(self, dumper, ctx, branch, jit_new_rev=None):
    """Use DUMPER to create all currently available parts of BRANCH that
    haven't been created already.  Use CTX.trunk_base, CTX.tags_base,
    and CTX.branches_base to determine the source and destination
    paths in the Subversion repository.

    JIT_NEW_REV is as documented for the copy_descend() function."""
    # Delegate to fill_name() with is_tag=None (branch flavor).
    self.fill_name(dumper, ctx, branch, None, jit_new_rev)
2060 def finish(self, dumper, ctx):
2061 """Use DUMPER to finish branches and tags that have either
2062 not been created yet, or have been only partially created.
2063 Use CTX.trunk_base, CTX.tags_base, and CTX.branches_base to
2064 determine the source and destination paths in the Subversion
2065 repository."""
2066 parent_key = self.root_key
2067 parent = self.db[parent_key]
2068 # Do all branches first, then all tags. We don't bother to check
2069 # here whether a given name is a branch or a tag, or is done
2070 # already; the fill_foo() methods will just do nothing if there's
2071 # nothing to do.
2073 # We do one revision per branch or tag, for clarity to users, not
2074 # for correctness. In CVS, when you make a branch off a branch,
2075 # the new branch will just root itself in the roots of the old
2076 # branch *except* where the new branch sprouts from a revision
2077 # that was actually committed on the old branch. In the former
2078 # cases, the source paths will be the same as the source paths
2079 # from which the old branch was created and therefore will already
2080 # exist; and in the latter case, the source paths will actually be
2081 # on the old branch, but those paths will exist already because
2082 # they were commits on that branch and therefore cvs2svn must have
2083 # created it already (see the fill_branch call in Commit.commit).
2084 # So either way, the source paths exist by the time we need them.
2086 ### It wouldn't be so awfully hard to determine whether a name is
2087 ### just a branch or just a tag, which would allow for more
2088 ### intuitive messages below.
2089 if not ctx.trunk_only:
2090 print "Finishing branches:"
2091 for name in parent.keys():
2092 if name[0] != '/':
2093 print "finishing '%s' as branch" % name
2094 self.fill_branch(dumper, ctx, name, [1])
2095 print "Finishing tags:"
2096 for name in parent.keys():
2097 if name[0] != '/':
2098 print "finishing '%s' as tag" % name
2099 self.fill_tag(dumper, ctx, name, [1])
def is_trunk_vendor_revision(default_branches_db, cvs_path, cvs_rev):
  """Return 1 if CVS_REV of CVS_PATH is a trunk (i.e., head) vendor
  revision according to DEFAULT_BRANCHES_DB, else return None."""
  # No default-branch record at all means CVS_REV cannot be one.
  if not default_branches_db.has_key(cvs_path):
    return None
  # The recorded value is "BRANCH.N": the default branch plus the last
  # revision component seen on it.
  recorded = default_branches_db[cvs_path]
  dot = recorded.rindex(".")
  default_branch = recorded[:dot]
  default_tail = int(recorded[dot + 1:])
  dot = cvs_rev.rindex(".")
  received_branch = cvs_rev[:dot]
  received_tail = int(cvs_rev[dot + 1:])
  # CVS_REV qualifies when it lies on the default branch at or before
  # the recorded last revision.
  if default_branch == received_branch and received_tail <= default_tail:
    return 1
  return None
class Commit:
  """A group of CVS file changes sharing one author and log message,
  to be emitted as (at least) one Subversion revision by commit()."""

  def __init__(self, author, log):
    """Create an empty commit for AUTHOR and log message LOG."""
    self.author = author
    self.log = log

    # Set of files touched by this commit (maps filename -> 1).
    self.files = { }
    # For consistency, the elements of both lists are of the form
    #
    #   (file, rev, deltatext_code, branch_name, tags, branches)
    #
    # even though self.deletes doesn't use the deltatext_code.
    self.changes = [ ]
    self.deletes = [ ]

    # Start out with a t_min higher than any incoming time T, and a
    # t_max lower than any incoming T.  This way the first T will
    # push t_min down to T, and t_max up to T, naturally (without any
    # special-casing), and successive times will then ratchet them
    # outward as appropriate.
    self.t_min = 1L<<32
    self.t_max = 0

  def __cmp__(self, other):
    # Commits should be sorted by t_max.  If both self and other have
    # the same t_max, break the tie using t_min.
    return cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)

  def has_file(self, fname):
    """Return true if FNAME is already part of this commit."""
    return self.files.has_key(fname)

  def add(self, t, op, file, rev, deltatext_code, branch_name, tags, branches):
    """Record a change (OP == OP_CHANGE) or deletion of REV of FILE at
    time T, expanding this commit's time range to include T.
    NOTE(review): parameter 'file' shadows the builtin; kept because
    renaming would change the keyword-argument interface."""
    # Record the time range of this commit.
    #
    # ### ISSUE: It's possible, though unlikely, that the time range
    # of a commit could get gradually expanded to be arbitrarily
    # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
    # problem, and anyway deciding where to break it up would be a
    # judgement call.  For now, we just print a warning in commit() if
    # this happens.
    if t < self.t_min:
      self.t_min = t
    if t > self.t_max:
      self.t_max = t

    if op == OP_CHANGE:
      self.changes.append((file, rev, deltatext_code, branch_name,
                           tags, branches))
    else:
      # OP_DELETE
      self.deletes.append((file, rev, deltatext_code, branch_name,
                           tags, branches))
    self.files[file] = 1

  def commit(self, dumper, ctx, sym_tracker):
    """Emit this commit through DUMPER, updating SYM_TRACKER's
    tag/branch bookkeeping.  May produce up to two SVN revisions: the
    main one, plus a compensating revision for non-trunk default
    (vendor) branch commits."""
    # commit this transaction
    seconds = self.t_max - self.t_min
    print 'committing: %s, over %d seconds' % (time.ctime(self.t_min), seconds)
    if seconds > COMMIT_THRESHOLD:
      print '%s: commit spans more than %d seconds' \
            % (warning_prefix, COMMIT_THRESHOLD)

    if ctx.dry_run:
      # Report what would happen, without writing to the dumpfile.
      for f, r, dt_code, br, tags, branches in self.changes:
        # compute a repository path, dropping the ,v from the file name
        svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
        print " adding or changing '%s' : '%s'" % (r, svn_path)
      for f, r, dt_code, br, tags, branches in self.deletes:
        # compute a repository path, dropping the ,v from the file name
        svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
        print " deleting '%s' : '%s'" % (r, svn_path)
      print ' (skipped; dry run enabled)'
      return

    do_copies = [ ]

    # State for handling default branches.
    #
    # Here is a tempting, but ultimately nugatory, bit of logic, which
    # I share with you so you may appreciate the less attractive, but
    # refreshingly non-nugatory, logic which follows it:
    #
    # If some of the commits in this txn happened on a non-trunk
    # default branch, then those files will have to be copied into
    # trunk manually after being changed on the branch (because the
    # RCS "default branch" appears as head, i.e., trunk, in practice).
    # As long as those copies don't overwrite any trunk paths that
    # were also changed in this commit, then we can do the copies in
    # the same revision, because they won't cover changes that don't
    # appear anywhere/anywhen else.  However, if some of the trunk dst
    # paths *did* change in this commit, then immediately copying the
    # branch changes would lose those trunk mods forever.  So in this
    # case, we need to do at least that copy in its own revision.  And
    # for simplicity's sake, if we're creating the new revision for
    # even one file, then we just do all such copies together in the
    # new revision.
    #
    # Doesn't that sound nice?
    #
    # Unfortunately, Subversion doesn't support copies with sources
    # in the current txn.  All copies must be based in committed
    # revisions.  Therefore, we generate the above-described new
    # revision unconditionally.
    #
    # Each of these is a list of tuples.  Each tuple is of the form:
    #
    #   (cvs_path, branch_name, tags_rooted_here, branches_rooted_here)
    #
    # and a tuple is created for each default branch commit that will
    # need to be copied to trunk (or deleted from trunk) in the
    # generated revision following the "regular" revision.
    default_branch_copies = [ ]
    default_branch_deletes = [ ]

    # we already have the date, so just format it
    date = format_date(self.t_max)
    try:
      ### FIXME: The 'replace' behavior should be an option, like
      ### --encoding is.
      unicode_author = unicode(self.author, ctx.encoding, 'replace')
      unicode_log = unicode(self.log, ctx.encoding, 'replace')
      props = { 'svn:author' : unicode_author.encode('utf8'),
                'svn:log' : unicode_log.encode('utf8'),
                'svn:date' : date }
    except UnicodeError:
      print '%s: problem encoding author or log message:' % warning_prefix
      print " author: '%s'" % self.author
      print " log: '%s'" % self.log
      print " date: '%s'" % date
      for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
        print " rev %s of '%s'" % (cvs_rev, rcs_file)
      print "Consider rerunning with (for example) '--encoding=latin1'."
      # Just fall back to the original data.
      props = { 'svn:author' : self.author,
                'svn:log' : self.log,
                'svn:date' : date }

    # Tells whether we actually wrote anything to the dumpfile.
    svn_rev = SVN_INVALID_REVNUM

    # If any of the changes we are about to do are on branches, we need to
    # check and maybe fill them (in their own revisions) *before* we start
    # then data revision.  So we have to iterate over changes and deletes twice.
    for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
      # compute a repository path, dropping the ,v from the file name
      cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
      svn_path = make_path(ctx, cvs_path, br)
      if br:
        ### FIXME: Here is an obvious optimization point.  Probably
        ### dump.probe_path(PATH) is kind of slow, because it does N
        ### database lookups for the N components in PATH.  If this
        ### turns out to be a performance bottleneck, we can just
        ### maintain a database mirroring just the head tree, but
        ### keyed on full paths, to reduce the check to a quick
        ### constant time query.
        if not dumper.probe_path(svn_path):
          sym_tracker.fill_branch(dumper, ctx, br, [1, date])

    for rcs_file, cvs_rev, dt_code, br, tags, branches in self.deletes:
      # compute a repository path, dropping the ,v from the file name
      cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
      svn_path = make_path(ctx, cvs_path, br)
      if br:
        ### FIXME: Here is an obvious optimization point.  Probably
        ### dump.probe_path(PATH) is kind of slow, because it does N
        ### database lookups for the N components in PATH.  If this
        ### turns out to be a performance bottleneck, we can just
        ### maintain a database mirroring just the head tree, but
        ### keyed on full paths, to reduce the check to a quick
        ### constant time query.
        if not dumper.probe_path(svn_path):
          sym_tracker.fill_branch(dumper, ctx, br, [1, date])

    # Now that any branches we need exist, we can do the commits.
    for rcs_file, cvs_rev, dt_code, br, tags, branches in self.changes:
      # compute a repository path, dropping the ,v from the file name
      cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
      svn_path = make_path(ctx, cvs_path, br)
      # Start the main revision lazily, on the first real change.
      if svn_rev == SVN_INVALID_REVNUM:
        svn_rev = dumper.start_revision(props)
      sym_tracker.enroot_tags(svn_path, svn_rev, tags)
      sym_tracker.enroot_branches(svn_path, svn_rev, branches)
      print " adding or changing %s : '%s'" % (cvs_rev, svn_path)

      # Only make a change if we need to.  When 1.1.1.1 has an empty
      # deltatext, the explanation is almost always that we're looking
      # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
      # such imports, CVS creates an RCS file where 1.1 has the
      # content, and 1.1.1.1 has an empty deltatext, i.e, the same
      # content as 1.1.  There's no reason to reflect this non-change
      # in the repository, so we want to do nothing in this case.  (If
      # we were really paranoid, we could make sure 1.1's log message
      # is the CVS-generated "Initial revision\n", but I think the
      # conditions below are strict enough.)
      if not ((dt_code == DELTATEXT_EMPTY) and (cvs_rev == "1.1.1.1")
              and dumper.probe_path(svn_path)):
        closed_tags, closed_branches = \
                     dumper.add_or_change_path(cvs_path,
                                               svn_path,
                                               cvs_rev,
                                               rcs_file,
                                               tags,
                                               branches,
                                               ctx)
        if is_trunk_vendor_revision(ctx.default_branches_db,
                                    cvs_path, cvs_rev):
          # Vendor-branch head changes must be mirrored to trunk in
          # the compensating revision below.
          default_branch_copies.append((cvs_path, br, tags, branches))
        sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
        sym_tracker.close_branches(svn_path, svn_rev, closed_branches)

    for rcs_file, cvs_rev, dt_code, br, tags, branches in self.deletes:
      # compute a repository path, dropping the ,v from the file name
      cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
      svn_path = make_path(ctx, cvs_path, br)
      print " deleting %s : '%s'" % (cvs_rev, svn_path)
      if svn_rev == SVN_INVALID_REVNUM:
        svn_rev = dumper.start_revision(props)
      # Uh, can this even happen on a deleted path?  Hmmm.  If not,
      # there's no risk, since tags and branches would just be empty
      # and therefore enrooting would be a no-op.  Still, it would
      # be clearer to know for sure and simply not call it.
      sym_tracker.enroot_tags(svn_path, svn_rev, tags)
      sym_tracker.enroot_branches(svn_path, svn_rev, branches)
      ### FIXME: this will return path_deleted == None if no path
      ### was deleted.  But we'll already have started the revision
      ### by then, so it's a bit late to use the knowledge!  Need to
      ### reorganize things so that starting the revision is a
      ### callback with its own internal conditional, so anyone can
      ### just invoke when they know they're really about to do
      ### something.
      ###
      ### Right now what happens is we get an empty revision
      ### (assuming nothing else happened in this revision).
      path_deleted, closed_tags, closed_branches = \
                    dumper.delete_path(svn_path, tags, branches, ctx.prune)
      if is_trunk_vendor_revision(ctx.default_branches_db, cvs_path, cvs_rev):
        default_branch_deletes.append((cvs_path, br, tags, branches))
      sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
      sym_tracker.close_branches(svn_path, svn_rev, closed_branches)

    if svn_rev == SVN_INVALID_REVNUM:
      print ' no new revision created, as nothing to do'
    else:
      print ' new revision:', svn_rev
      if default_branch_copies or default_branch_deletes:
        # Generate the compensating revision that mirrors non-trunk
        # default-branch changes onto trunk (see the long comment
        # above for why it must be a separate revision).
        previous_rev = svn_rev
        msg = 'This commit was generated by cvs2svn to compensate for ' \
              'changes in r%d,\n' \
              'which included commits to RCS files with non-trunk default ' \
              'branches.\n' % previous_rev
        props = { 'svn:author' : 'cvs2svn',
                  'svn:log' : msg,
                  'svn:date' : date }
        svn_rev = dumper.start_revision(props)

        for cvs_path, br, tags, branches in default_branch_copies:
          src_path = make_path(ctx, cvs_path, br)
          dst_path = make_path(ctx, cvs_path)
          if (dumper.probe_path(dst_path)):
            # Replace the existing trunk path with the branch version.
            ign, closed_tags, closed_branches = \
                 dumper.delete_path(dst_path, tags, branches, ctx.prune)
            sym_tracker.close_tags(dst_path, svn_rev, closed_tags)
            sym_tracker.close_branches(dst_path, svn_rev, closed_branches)
          dumper.copy_path(src_path, previous_rev, dst_path)

        for cvs_path, br, tags, branches in default_branch_deletes:
          # Ignore the branch -- we don't need to know the default
          # branch, we already know we're deleting this from trunk.
          dst_path = make_path(ctx, cvs_path)
          if (dumper.probe_path(dst_path)):
            ign, closed_tags, closed_branches = \
                 dumper.delete_path(dst_path, tags, branches, ctx.prune)
            sym_tracker.close_tags(dst_path, svn_rev, closed_tags)
            sym_tracker.close_branches(dst_path, svn_rev, closed_branches)
def read_resync(fname):
  "Read the .resync file into memory."

  ### note that we assume that we can hold the entire resync file in
  ### memory.  really large repositories with whacky timestamps could
  ### bust this assumption.  should that ever happen, then it is possible
  ### to split the resync file into pieces and make multiple passes,
  ### using each piece.

  # A digest maps to a sequence of lists which specify a lower and upper
  # time bound for matching up the commit.  We keep a sequence of these
  # because a number of checkins with the same log message (e.g. an empty
  # log message) could need to be remapped.  We also make them a list because
  # we will dynamically expand the lower/upper bound as we find commits
  # that fall into a particular msg and time range.
  #
  # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
  resync = { }

  # Each record is: 8 hex digits of original time, the log digest, and
  # the resynced time in hex.
  for line in fileinput.FileInput(fname):
    old_time = int(line[:8], 16)
    digest = line[9:DIGEST_END_IDX]
    new_time = int(line[DIGEST_END_IDX+1:], 16)
    window = [old_time - COMMIT_THRESHOLD/2,
              old_time + COMMIT_THRESHOLD/2,
              new_time]
    resync.setdefault(digest, []).append(window)

  # For each digest, sort the resync items in it in increasing order,
  # based on the lower time bound.
  for windows in resync.values():
    windows.sort()

  return resync
def parse_revs_line(line):
  """Parse one line of the revs file, returning a tuple:

     (timestamp, digest, op, rev, deltatext_code, fname,
      branch_name, tags, branches)

  LINE is in the format produced by write_revs_line(): hex timestamp,
  log-message digest, operation code, revision, deltatext code,
  branch name ('*' when there is none), tag count, tags, branch
  count, branches, and finally the ,v filename."""
  data = line.split(' ', 7)
  timestamp = int(data[0], 16)
  # Renamed from 'id', which shadowed the builtin.
  digest = data[1]
  op = data[2]
  rev = data[3]
  deltatext_code = data[4]
  branch_name = data[5]
  if branch_name == "*":
    # '*' is the placeholder written for "no branch".
    branch_name = None
  ntags = int(data[6])
  # The remainder holds the tags, the branch count, the branches, and
  # the filename, all space-separated.  Split just far enough each
  # time so a filename containing spaces survives intact.
  tags = data[7].split(' ', ntags + 1)
  nbranches = int(tags[ntags])
  branches = tags[ntags + 1].split(' ', nbranches)
  fname = branches[nbranches][:-1]  # strip \n
  tags = tags[:ntags]
  branches = branches[:nbranches]

  return timestamp, digest, op, rev, deltatext_code, \
         fname, branch_name, tags, branches
def write_revs_line(output, timestamp, digest, op, revision,
                    deltatext_code, fname, branch_name, tags, branches):
  """Write one record to OUTPUT in the revs-file format read back by
  parse_revs_line().  A false BRANCH_NAME is recorded as '*'."""
  if not branch_name:
    branch_name = "*"
  # Assemble all fields and emit them in one space-joined write; the
  # bytes produced are identical to writing field by field.
  fields = ['%08lx' % timestamp, digest, op, revision, deltatext_code,
            branch_name, '%d' % len(tags)]
  fields.extend(tags)
  fields.append('%d' % len(branches))
  fields.extend(branches)
  fields.append(fname)
  output.write(' '.join(fields) + '\n')
2477 def pass1(ctx):
2478 cd = CollectData(ctx.cvsroot, DATAFILE, ctx.default_branches_db)
2479 p = rcsparse.Parser()
2480 stats = [ 0 ]
2481 os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
2482 if ctx.verbose:
2483 print 'processed', stats[0], 'files'
2484 if len(cd.fatal_errors) > 0:
2485 sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
2486 + "Error summary:\n"
2487 + "\n".join(cd.fatal_errors)
2488 + "\nExited due to fatal error(s).")
def pass2(ctx):
  "Pass 2: clean up the revision information."

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)

  output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
    timestamp, digest, op, rev, deltatext_code, fname, \
               branch_name, tags, branches = parse_revs_line(line)
    # Lines whose log digest has no resync record pass through as-is.
    if not resync.has_key(digest):
      output.write(line)
      continue

    # we have a hit.  see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync[digest]:
      if record[0] <= timestamp <= record[1]:
        # bingo!  remap the time on this (record[2] is the new time).
        write_revs_line(output, record[2], digest, op, rev,
                        deltatext_code, fname, branch_name, tags, branches)

        print "RESYNC: '%s' (%s) : old time='%s' new time='%s'" \
              % (relative_name(ctx.cvsroot, fname),
                 rev, time.ctime(timestamp), time.ctime(record[2]))

        # adjust the time range.  we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)

        # stop looking for hits
        break
    else:
      # NOTE: this 'else' belongs to the 'for' loop above -- it runs
      # only when no record matched (the loop finished without 'break').
      # the file/rev did not need to have its time changed.
      output.write(line)
def pass3(ctx):
  """Pass 3: sort the cleaned-up revisions file.

  Runs the external 'sort' over the c-revs file to produce the s-revs
  file, ordered by timestamp (and digest within each timestamp).
  """
  # GNU sort will sort our dates differently (incorrectly!) if our
  # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
  # it to 'C'
  lc_all_tmp = os.environ.get('LC_ALL')
  os.environ['LC_ALL'] = 'C'
  try:
    run_command('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
                                  ctx.log_fname_base + SORTED_REVS_SUFFIX))
  finally:
    # Restore the caller's locale even if run_command raises; the
    # original code would otherwise leak LC_ALL='C' into the process
    # environment on error.
    if lc_all_tmp is None:
      del os.environ['LC_ALL']
    else:
      os.environ['LC_ALL'] = lc_all_tmp
def pass4(ctx):
  # Pass 4: read the sorted revisions and replay them as logical commits
  # into the dumpfile, tracking symbolic names (tags/branches) as we go.
  sym_tracker = SymbolicNameTracker()
  metadata_db = Database(METADATA_DB, 'r')

  # A dictionary of Commit objects, keyed by digest. Each object
  # represents one logical commit, which may involve multiple files.
  #
  # The reason this is a dictionary, not a single object, is that
  # there may be multiple commits interleaved in time. A commit can
  # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
  # for parts of some other commit to occur. Since the s-revs file is
  # sorted by timestamp first, then by digest within each timestamp,
  # it's quite easy to have interleaved commits.
  commits = { }

  # The total number of separate commits processed. This is used only for
  # printing statistics, it does not affect the results in the repository.
  count = 0

  # Start the dumpfile object.
  dumper = Dumper(ctx)

  # process the logfiles, creating the target
  for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
    timestamp, id, op, rev, deltatext_code, fname, \
               branch_name, tags, branches = parse_revs_line(line)

    if ctx.trunk_only and not trunk_rev.match(rev):
      ### note this could/should have caused a flush, but the next item
      ### will take care of that for us
      continue

    # Each time we read a new line, we scan the commits we've
    # accumulated so far to see if any are ready for processing now.
    # (In Python 2, commits.items() returns a list snapshot, so deleting
    # entries from the dict inside this loop is safe.)
    process = [ ]
    for scan_id, scan_c in commits.items():
      # Commit window has expired: schedule it for flushing.
      if scan_c.t_max + COMMIT_THRESHOLD < timestamp:
        process.append(scan_c)
        del commits[scan_id]
        continue
      # If the inbound commit is on the same file as a pending commit,
      # close the pending commit to further changes. Don't flush it though,
      # as there may be other pending commits dated before this one.
      # ### ISSUE: the has_file() check below is not optimal.
      # It does fix the dataloss bug where revisions would get lost
      # if checked in too quickly, but it can also break apart the
      # commits. The correct fix would require tracking the dependencies
      # between change sets and committing them in proper order.
      if scan_c.has_file(fname):
        # Re-key the closed commit under a digest that can never match an
        # incoming line (trailing '-'), so new revs start a fresh Commit.
        unused_id = scan_id + '-'
        while commits.has_key(unused_id):
          unused_id = unused_id + '-'
        commits[unused_id] = scan_c
        del commits[scan_id]

    # If there are any elements in 'process' at this point, they need
    # to be committed, because this latest rev couldn't possibly be
    # part of any of them. Sort them into time-order, then commit 'em.
    process.sort()
    for c in process:
      c.commit(dumper, ctx, sym_tracker)
    count = count + len(process)

    # Add this item into the set of still-available commits.
    if commits.has_key(id):
      c = commits[id]
    else:
      author, log = metadata_db[id]
      c = commits[id] = Commit(author, log)
    c.add(timestamp, op, fname, rev, deltatext_code, branch_name,
          tags, branches)

  # End of the sorted revs file. Flush any remaining commits:
  if commits:
    process = commits.values()
    process.sort()
    for c in process:
      c.commit(dumper, ctx, sym_tracker)
    count = count + len(process)

  # Create (or complete) any branches and tags not already done.
  sym_tracker.finish(dumper, ctx)

  dumper.close()

  if ctx.verbose:
    print count, 'commits processed.'
def pass5(ctx):
  """Pass 5: delete the temporary files left behind by passes 1-4.

  A no-op when the user requested --skip-cleanup.
  """
  if ctx.skip_cleanup:
    return

  # Remove our database files
  for db_file in (SVN_REVISIONS_DB, NODES_DB, SYMBOLIC_NAME_ROOTS_DB,
                  SYMBOLIC_NAMES_DB, METADATA_DB):
    os.unlink(db_file)

  # This is the only DB reference still reachable at this point; lose
  # it before removing the file.
  ctx.default_branches_db = None
  os.unlink(DEFAULT_BRANCHES_DB)

  # Remove our other data files
  for suffix in (REVS_SUFFIX, CLEAN_REVS_SUFFIX,
                 SORTED_REVS_SUFFIX, RESYNC_SUFFIX):
    os.unlink('cvs2svn-data' + suffix)
# The conversion passes, executed in order by convert().  The -p
# command-line option selects a 1-based starting index into this list.
_passes = [
  pass1,
  pass2,
  pass3,
  pass4,
  pass5,
class _ctx:
  """Empty attribute bag holding the conversion options and shared state.

  main() assigns one attribute per command-line option; the pass
  functions read (and occasionally update) them from the instance.
  """
  pass
class MimeMapper:
  """A class that provides mappings from file names to MIME types."""

  # BUG FIX: the constructor was spelled '_init_' (single underscores),
  # so it was never invoked and the first attribute access raised
  # AttributeError.  It must be the '__init__' special method.
  def __init__(self):
    # Maps file extension (or extension-less basename) -> MIME type.
    self.mappings = { }
    # Extensions that were looked up but had no mapping; reported by
    # print_missing_mappings() at the end of the run.
    self.missing_mappings = { }

  def set_mime_types_file(self, mime_types_file):
    """Load an apache-style mime.types file into self.mappings.

    Comment lines (leading '#') and lines with no extensions are
    skipped.  If an extension is mapped to two different types, a
    warning is printed and the later mapping wins.
    """
    for line in fileinput.input(mime_types_file):
      if line.startswith("#"):
        continue

      # format of a line is something like
      # text/plain c h cpp
      extensions = line.split()
      if len(extensions) < 2:
        continue
      # Renamed from 'type' to avoid shadowing the builtin.
      mime_type = extensions.pop(0)
      for ext in extensions:
        # 'in' instead of the Python-2-only dict.has_key().
        if ext in self.mappings and self.mappings[ext] != mime_type:
          sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n" \
                           % (warning_prefix, ext, self.mappings[ext],
                              mime_type))
        self.mappings[ext] = mime_type

  def get_type_from_filename(self, filename):
    """Return the MIME type mapped for FILENAME, or None if unmapped.

    Unmapped lookups are recorded in self.missing_mappings.
    """
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping. This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename
    if extension in self.mappings:
      return self.mappings[extension]
    self.missing_mappings[extension] = 1
    return None

  def print_missing_mappings(self):
    """Warn on stderr about each extension that was looked up but unmapped."""
    for ext in self.missing_mappings:
      sys.stderr.write("%s: no MIME mapping for *.%s\n" % (warning_prefix, ext))
2726 def convert(ctx, start_pass=1):
2727 "Convert a CVS repository to an SVN repository."
2729 if not os.path.exists(ctx.cvsroot):
2730 sys.stderr.write(error_prefix + ': \'%s\' does not exist.\n' % ctx.cvsroot)
2731 sys.exit(1)
2733 times = [ None ] * len(_passes)
2734 for i in range(start_pass - 1, len(_passes)):
2735 times[i] = time.time()
2736 print '----- pass %d -----' % (i + 1)
2737 _passes[i](ctx)
2738 times.append(time.time())
2740 for i in range(start_pass, len(_passes)+1):
2741 print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
2742 print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
def usage(ctx):
  # Print the command-line help to stdout.  CTX supplies the current
  # defaults quoted in the option descriptions.
  print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
        % os.path.basename(sys.argv[0])
  print '  --help, -h           print this usage message and exit with success'
  print '  -n                   dry run; parse CVS repos, but do not construct SVN repos'
  print '  -v                   verbose'
  print '  -s PATH              path for SVN repos'
  print '  -p NUM               start at pass NUM of %d' % len(_passes)
  print '  --existing-svnrepos  load into existing SVN repository'
  print '  --dumpfile=PATH      name of intermediate svn dumpfile'
  print '  --svnadmin=PATH      path to the svnadmin program'
  print '  --trunk-only         convert only trunk commits, not tags nor branches'
  print '  --trunk=PATH         path for trunk (default: %s)' \
        % ctx.trunk_base
  print '  --branches=PATH      path for branches (default: %s)' \
        % ctx.branches_base
  print '  --tags=PATH          path for tags (default: %s)' \
        % ctx.tags_base
  print '  --no-prune           don\'t prune empty directories'
  print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
  print '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)' \
        % ctx.encoding
  print '  --username=NAME      username for cvs2svn-synthesized commits'
  print '                       (default: %s)' \
        % ctx.username
  print '  --skip-cleanup       prevent the deletion of intermediate files'
  print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
  print '  --cvs-revnums        record CVS revision numbers as file properties'
  print '  --mime-types=FILE    specify an apache-style mime.types file for\n' \
        '                       setting svn:mime-type'
  print '  --set-eol-style      automatically set svn:eol-style=native for\n' \
        '                       text files (needs --mime-types)'
def main():
  # Command-line entry point: build the option context, validate the
  # option combination, take the working-directory lock, and convert.

  # prepare the operation context
  ctx = _ctx()
  ctx.cvsroot = None
  ctx.target = None
  ctx.log_fname_base = DATAFILE
  ctx.dumpfile = DUMPFILE
  ctx.verbose = 0
  ctx.dry_run = 0
  ctx.prune = 1
  ctx.existing_svnrepos = 0
  ctx.dump_only = 0
  ctx.trunk_only = 0
  ctx.trunk_base = "trunk"
  ctx.tags_base = "tags"
  ctx.branches_base = "branches"
  ctx.encoding = "ascii"
  ctx.mime_types_file = None
  ctx.mime_mapper = None
  ctx.set_eol_style = 0
  ctx.svnadmin = "svnadmin"
  ctx.username = "unknown"
  ctx.print_help = 0
  ctx.skip_cleanup = 0
  ctx.cvs_revnums = 0
  ctx.bdb_txn_nosync = 0

  start_pass = 1

  try:
    opts, args = getopt.getopt(sys.argv[1:], 'p:s:vnh',
                               [ "help", "create", "trunk=",
                                 "username=", "existing-svnrepos",
                                 "branches=", "tags=", "encoding=",
                                 "mime-types=", "set-eol-style",
                                 "trunk-only", "no-prune",
                                 "dump-only", "dumpfile=", "svnadmin=",
                                 "skip-cleanup", "cvs-revnums",
                                 "bdb-txn-nosync"])
  except getopt.GetoptError, e:
    sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
    usage(ctx)
    sys.exit(1)

  # Transfer each parsed option onto the context object.
  for opt, value in opts:
    if opt == '-p':
      start_pass = int(value)
      if start_pass < 1 or start_pass > len(_passes):
        print '%s: illegal value (%d) for starting pass. ' \
              'must be 1 through %d.' % (error_prefix, start_pass,
                                         len(_passes))
        sys.exit(1)
    elif (opt == '--help') or (opt == '-h'):
      ctx.print_help = 1
    elif opt == '-v':
      ctx.verbose = 1
    elif opt == '-n':
      ctx.dry_run = 1
    elif opt == '-s':
      ctx.target = value
    elif opt == '--existing-svnrepos':
      ctx.existing_svnrepos = 1
    elif opt == '--dumpfile':
      ctx.dumpfile = value
    elif opt == '--svnadmin':
      ctx.svnadmin = value
    elif opt == '--trunk-only':
      ctx.trunk_only = 1
    elif opt == '--trunk':
      ctx.trunk_base = value
    elif opt == '--branches':
      ctx.branches_base = value
    elif opt == '--tags':
      ctx.tags_base = value
    elif opt == '--no-prune':
      ctx.prune = None
    elif opt == '--dump-only':
      ctx.dump_only = 1
    elif opt == '--encoding':
      ctx.encoding = value
    elif opt == '--mime-types':
      ctx.mime_types_file = value
    elif opt == '--set-eol-style':
      ctx.set_eol_style = 1
    elif opt == '--username':
      ctx.username = value
    elif opt == '--skip-cleanup':
      ctx.skip_cleanup = 1
    elif opt == '--cvs-revnums':
      ctx.cvs_revnums = 1
    elif opt == '--bdb-txn-nosync':
      ctx.bdb_txn_nosync = 1
    elif opt == '--create':
      sys.stderr.write(warning_prefix +
                       ': The behaviour produced by the --create option is now the '
                       'default,\nand passing the option is deprecated.\n')

  if ctx.print_help:
    usage(ctx)
    sys.exit(0)

  # Consistency check for options and arguments.
  if len(args) == 0:
    usage(ctx)
    sys.exit(1)

  if len(args) > 1:
    sys.stderr.write(error_prefix +
                     ": must pass only one CVS repository.\n")
    usage(ctx)
    sys.exit(1)

  ctx.cvsroot = args[0]

  if not os.path.isdir(ctx.cvsroot):
    sys.stderr.write(error_prefix +
                     ": the cvs-repos-path '%s' is not an "
                     "existing directory.\n" % ctx.cvsroot)
    sys.exit(1)

  if (not ctx.target) and (not ctx.dump_only):
    sys.stderr.write(error_prefix +
                     ": must pass one of '-s' or '--dump-only'.\n")
    sys.exit(1)

  # Local helper: complain when two mutually-exclusive options are
  # both present.
  def not_both(opt1val, opt1name, opt2val, opt2name):
    if opt1val and opt2val:
      sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
                       % (opt1name, opt2name))

  not_both(ctx.target, '-s', ctx.dump_only, '--dump-only')

  not_both(ctx.dump_only, '--dump-only',
           ctx.existing_svnrepos, '--existing-svnrepos')

  not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
           ctx.existing_svnrepos, '--existing-svnrepos')

  not_both(ctx.dump_only, '--dump-only',
           ctx.bdb_txn_nosync, '--bdb-txn-nosync')

  if ((string.find(ctx.trunk_base, '/') > -1)
      or (string.find(ctx.tags_base, '/') > -1)
      or (string.find(ctx.branches_base, '/') > -1)):
    sys.stderr.write("%s: cannot pass multicomponent path to "
                     "--trunk, --tags, or --branches yet.\n"
                     "  See http://subversion.tigris.org/issues/show_bug.cgi?"
                     "id=1409 "
                     "for details.\n" % error_prefix)
    sys.exit(1)

  if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
    sys.stderr.write(error_prefix +
                     ": the svn-repos-path '%s' is not an "
                     "existing directory.\n" % ctx.target)
    sys.exit(1)

  if not ctx.dump_only and not ctx.existing_svnrepos \
     and os.path.exists(ctx.target):
    sys.stderr.write(error_prefix +
                     ": the svn-repos-path '%s' exists.\nRemove it, or pass "
                     "'--existing-svnrepos'.\n" % ctx.target)
    sys.exit(1)

  if ctx.set_eol_style and not ctx.mime_types_file:
    sys.stderr.write(error_prefix +
                     ": can only pass '--set-eol-style' if you also pass"
                     " '--mime-types'.\n")
    sys.exit(1)

  if ctx.mime_types_file:
    ctx.mime_mapper = MimeMapper()
    ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)

  # Lock the current directory for temporary files.
  # os.mkdir is atomic, so the lock directory acts as a cross-process
  # mutex (issue 27: two cvs2svn instances run from the same directory
  # would otherwise trash each other's temporary files).
  try:
    os.mkdir('cvs2svn.lock')
  except OSError:
    sys.stderr.write(error_prefix +
                     ": cvs2svn writes temporary files to the current working directory.\n"
                     "  The directory 'cvs2svn.lock' exists, indicating that another\n"
                     "  cvs2svn process is currently using the current directory for its\n"
                     "  temporary workspace. If you are certain that is not the case,\n"
                     "  remove the 'cvs2svn.lock' directory.\n")
    sys.exit(1)
  try:
    ctx.default_branches_db = Database(DEFAULT_BRANCHES_DB, 'n')
    convert(ctx, start_pass=start_pass)
  finally:
    # Always release the lock directory, even if the conversion failed.
    try: os.rmdir('cvs2svn.lock')
    except: pass

  if ctx.mime_types_file:
    ctx.mime_mapper.print_missing_mappings()
# Script entry point: run the conversion only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
  main()