Fix issue 1514.
[cvs2svn.git] / cvs2svn.py
blobef8490a3214fda165f858e44b2851567f6aedeb3
1 #!/usr/bin/env python
3 # cvs2svn: ...
6 # $LastChangedRevision$
8 import rcsparse
9 import os
10 import sys
11 import sha
12 import re
13 import time
14 import fileinput
15 import string
16 import getopt
17 import stat
18 import md5
19 import anydbm
20 import marshal
22 # Make sure this Python is recent enough.
23 import sys
24 if sys.hexversion < 0x2000000:
25 sys.stderr.write('Python 2.0 or higher is required; see www.python.org.\n')
26 sys.exit(1)
28 # Don't settle for less.
29 if anydbm._defaultmod.__name__ == 'dumbdbm':
30 print 'ERROR: your installation of Python does not contain a proper'
31 print ' DBM module. This script cannot continue.'
32 print ' to solve: see http://python.org/doc/current/lib/module-anydbm.html'
33 print ' for details.'
34 sys.exit(1)
36 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
37 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
38 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
40 DATAFILE = 'cvs2svn-data'
41 DUMPFILE = 'cvs2svn-dump' # The "dumpfile" we create to load into the repos
43 # Skeleton version of an svn filesystem.
44 SVN_REVISIONS_DB = 'cvs2svn-revisions.db'
45 NODES_DB = 'cvs2svn-nodes.db'
46 SYMBOLIC_NAME_ROOTS_DB = 'cvs2svn-symroots.db'
48 # See class SymbolicNameTracker for details.
49 SYMBOLIC_NAMES_DB = "cvs2svn-sym-names.db"
51 REVS_SUFFIX = '.revs'
52 CLEAN_REVS_SUFFIX = '.c-revs'
53 SORTED_REVS_SUFFIX = '.s-revs'
54 RESYNC_SUFFIX = '.resync'
56 ATTIC = os.sep + 'Attic'
58 SVN_INVALID_REVNUM = -1
60 COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
62 OP_NOOP = '-'
63 OP_ADD = 'A'
64 OP_DELETE = 'D'
65 OP_CHANGE = 'C'
67 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
69 verbose = 1
72 # Officially, CVS symbolic names must use a fairly restricted set of
73 # characters. Unofficially, we don't care if some repositories out
74 # there don't abide by this, as long as their tags start with a letter
75 # and don't include '/' or '\' (both of which are prohibited by
76 # official restrictions anyway).
77 symbolic_name_re = re.compile('^[a-zA-Z][^/\\\\]*$')
79 class CollectData(rcsparse.Sink):
80 def __init__(self, cvsroot, log_fname_base):
81 self.cvsroot = cvsroot
82 self.revs = open(log_fname_base + REVS_SUFFIX, 'w')
83 self.resync = open(log_fname_base + RESYNC_SUFFIX, 'w')
85 def set_fname(self, fname):
86 "Prepare to receive data for a new file."
87 self.fname = fname
89 # revision -> [timestamp, author, operation, old-timestamp]
90 self.rev_data = { }
91 self.prev = { }
92 self.branch_names = {}
93 self.taglist = {}
94 self.branchlist = {}
96 def set_branch_name(self, revision, name):
97 """Record that REVISION is the branch number for BRANCH_NAME.
98 REVISION is an RCS branch number with an odd number of components,
99 for example '1.7.2' (never '1.7.0.2')."""
100 if self.branch_names.has_key(revision):
101 sys.stderr.write("Error while parsing '%s':\n"
102 " branch %s already has name '%s',\n"
103 " cannot also have name '%s'.\n" \
104 % (self.fname, revision,
105 self.branch_names[revision], name))
106 sys.exit(1)
107 self.branch_names[revision] = name
109 def get_branch_name(self, revision):
110 """Return the name of the branch on which REVISION lies.
111 REVISION is a non-branch evision number with an even number of,
112 components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2')."""
113 return self.branch_names.get(revision[:revision.rindex(".")])
115 def add_branch_point(self, revision, branch_name):
116 """Record that BRANCH_NAME sprouts from REVISION.
117 REVISION is a non-branch revision number with an even number of
118 components, for example '1.7' (never '1.7.2' nor '1.7.0.2')."""
119 if not self.branchlist.has_key(revision):
120 self.branchlist[revision] = []
121 self.branchlist[revision].append(branch_name)
123 def add_cvs_branch(self, revision, branch_name):
124 """Record the root revision and branch revision for BRANCH_NAME,
125 based on REVISION. REVISION is a CVS branch number having an even
126 number of components where the second-to-last is '0'. For
127 example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
128 from 1.7 and has branch number 1.7.2."""
129 last_dot = revision.rfind(".")
130 branch_rev = revision[:last_dot]
131 last2_dot = branch_rev.rfind(".")
132 branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
133 self.set_branch_name(branch_rev, branch_name)
134 self.add_branch_point(branch_rev[:last2_dot], branch_name)
136 def get_tags(self, revision):
137 """Return a list of all tag names attached to REVISION.
138 REVISION is a regular revision number like '1.7', and the result
139 never includes branch names, only plain tags."""
140 return self.taglist.get(revision, [])
142 def get_branches(self, revision):
143 """Return a list of all branch names that sprout from REVISION.
144 REVISION is a regular revision number like '1.7'."""
145 return self.branchlist.get(revision, [])
147 def define_tag(self, name, revision):
148 """Record a bidirectional mapping between symbolic NAME and REVISION
149 REVISION is an unprocessed revision number from the RCS file's
150 header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
151 This function will determine what kind of symbolic name it is by
152 inspection, and record it in the right places."""
153 if not symbolic_name_re.match(name):
154 sys.stderr.write("Error while parsing %s:\n"
155 " '%s' is not a valid tag or branch name.\n" \
156 % (self.fname, name))
157 sys.exit(1)
158 if branch_tag.match(revision):
159 self.add_cvs_branch(revision, name)
160 elif vendor_tag.match(revision):
161 self.set_branch_name(revision, name)
162 self.add_branch_point(revision[:revision.rfind(".")], name)
163 else:
164 if not self.taglist.has_key(revision):
165 self.taglist[revision] = []
166 self.taglist[revision].append(name)
168 def define_revision(self, revision, timestamp, author, state,
169 branches, next):
170 ### what else?
171 if state == 'dead':
172 op = OP_DELETE
173 else:
174 op = OP_CHANGE
176 # store the rev_data as a list in case we have to jigger the timestamp
177 self.rev_data[revision] = [int(timestamp), author, op, None]
179 # record the previous revision for sanity checking later
180 if trunk_rev.match(revision):
181 self.prev[revision] = next
182 elif next:
183 self.prev[next] = revision
184 for b in branches:
185 self.prev[b] = revision
187 def tree_completed(self):
188 "The revision tree has been parsed. Analyze it for consistency."
190 # Our algorithm depends upon the timestamps on the revisions occuring
191 # monotonically over time. That is, we want to see rev 1.34 occur in
192 # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
193 # sorting), and then tried to insert 1.34, we'd be screwed.
195 # to perform the analysis, we'll simply visit all of the 'previous'
196 # links that we have recorded and validate that the timestamp on the
197 # previous revision is before the specified revision
199 # if we have to resync some nodes, then we restart the scan. just keep
200 # looping as long as we need to restart.
201 while 1:
202 for current, prev in self.prev.items():
203 if not prev:
204 # no previous revision exists (i.e. the initial revision)
205 continue
206 t_c = self.rev_data[current][0]
207 t_p = self.rev_data[prev][0]
208 if t_p >= t_c:
209 # the previous revision occurred later than the current revision.
210 # shove the previous revision back in time (and any before it that
211 # may need to shift).
212 while t_p >= t_c:
213 self.rev_data[prev][0] = t_c - 1 # new timestamp
214 self.rev_data[prev][3] = t_p # old timestamp
216 print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
217 % (relative_name(self.cvsroot, self.fname),
218 prev, time.ctime(t_p), time.ctime(t_c - 1))
220 current = prev
221 prev = self.prev[current]
222 if not prev:
223 break
224 t_c = t_c - 1 # self.rev_data[current][0]
225 t_p = self.rev_data[prev][0]
227 # break from the for-loop
228 break
229 else:
230 # finished the for-loop (no resyncing was performed)
231 return
233 def set_revision_info(self, revision, log, text):
234 timestamp, author, op, old_ts = self.rev_data[revision]
235 digest = sha.new(log + '\0' + author).hexdigest()
236 if old_ts:
237 # the timestamp on this revision was changed. log it for later
238 # resynchronization of other files's revisions that occurred
239 # for this time and log message.
240 self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
242 branch_name = self.get_branch_name(revision)
244 write_revs_line(self.revs, timestamp, digest, op, revision, self.fname,
245 branch_name, self.get_tags(revision),
246 self.get_branches(revision))
249 def make_path(ctx, path, branch_name = None, tag_name = None):
250 """Return the trunk path, branch path, or tag path for PATH.
251 CTX holds the name of the branches or tags directory, which is
252 prepended to PATH when constructing a branch or tag path.
254 If PATH is empty or None, return the root trunk|branch|tag path.
256 It is an error to pass both a BRANCH_NAME and a TAG_NAME."""
258 # For a while, we treated each top-level subdir of the CVS
259 # repository as a "project root" and interpolated the appropriate
260 # genealogy (trunk|tag|branch) in according to the official
261 # recommended layout. For example, the path '/foo/bar/baz.c' on
262 # branch 'Rel2' would become
264 # /foo/branches/Rel2/bar/baz.c
266 # and on trunk it would become
268 # /foo/trunk/bar/baz.c
270 # However, we went back to the older and simpler method of just
271 # prepending the genealogy to the front, instead of interpolating.
272 # So now we produce:
274 # /branches/Rel2/foo/bar/baz.c
275 # /trunk/foo/bar/baz.c
277 # Why? Well, Jack Repenning pointed out that this way is much
278 # friendlier to "anonymously rooted subtrees" (that's a tree where
279 # the name of the top level dir doesn't matter, the point is that if
280 # you cd into it and, say, run 'make', something good will happen).
281 # By interpolating, we made it impossible to point cvs2svn at some
282 # subdir in the CVS repository and convert it as a project, because
283 # we'd treat every subdir underneath it as an independent project
284 # root, which is probably not what the user wanted.
286 # Also, see Blair Zajac's post
288 # http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
290 # and the surrounding thread, for why what people really want is a
291 # way of specifying an in-repository prefix path, not interpolation.
293 if branch_name and tag_name:
294 sys.stderr.write('make_path() miscalled: both branch and tag given.\n')
295 sys.exit(1)
297 if branch_name:
298 if path:
299 return ctx.branches_base + '/' + branch_name + '/' + path
300 else:
301 return ctx.branches_base + '/' + branch_name
302 elif tag_name:
303 if path:
304 return ctx.tags_base + '/' + tag_name + '/' + path
305 else:
306 return ctx.tags_base + '/' + tag_name
307 else:
308 if path:
309 return ctx.trunk_base + '/' + path
310 else:
311 return ctx.trunk_base
314 def relative_name(cvsroot, fname):
315 l = len(cvsroot)
316 if fname[:l] == cvsroot:
317 if fname[l] == '/':
318 return fname[l+1:]
319 return fname[l:]
320 sys.stderr.write('relative_path("%s", "%s"): fname is not a sub-path of'
321 ' cvsroot\n' % (cvsroot, fname))
322 sys.exit(1)
325 def visit_file(arg, dirname, files):
326 cd, p, stats = arg
327 for fname in files:
328 if fname[-2:] != ',v':
329 continue
330 pathname = os.path.join(dirname, fname)
331 if dirname[-6:] == ATTIC:
332 # drop the 'Attic' portion from the pathname
333 ### we should record this so we can easily insert it back in
334 cd.set_fname(os.path.join(dirname[:-6], fname))
335 else:
336 cd.set_fname(pathname)
337 if verbose:
338 print pathname
339 try:
340 p.parse(open(pathname, 'rb'), cd)
341 stats[0] = stats[0] + 1
342 except rcsparse.common.RCSExpected:
343 print "Warning: '%s' is not a valid ,v file, ignoring" % pathname
346 def is_vendor_first_revision(cvs_rev):
347 """Return true if CVS_REV is the first revision on a vendor branch,
348 false otherwise. If CVS_REV has an even number of components, and
349 last component is 1 and the component before that is odd, then it is
350 the first revision on a vendor branch."""
351 c = string.split(cvs_rev, '.')
352 n = len(c)
353 if ((n > 2) and (n % 2 == 0) and (c[-1] == '1') and (int(c[-2]) % 2 == 1)):
354 return 1
355 else:
356 return None
359 class RevInfoParser(rcsparse.Sink):
360 def __init__(self):
361 self.authors = { } # revision -> author
362 self.logs = { } # revision -> log message
364 def define_revision(self, revision, timestamp, author, state,
365 branches, next):
366 self.authors[revision] = author
368 def set_revision_info(self, revision, log, text):
369 self.logs[revision] = log
371 def parse_cvs_file(self, rcs_pathname):
372 try:
373 rcsfile = open(rcs_pathname, 'rb')
374 except:
375 try:
376 dirname, fname = os.path.split(rcs_pathname)
377 rcs_pathname = os.path.join(dirname, "Attic", fname)
378 rcsfile = open(rcs_pathname, 'rb')
379 except:
380 ### should use a better error
381 raise RuntimeError, ('error: %s appeared to be under CVS control, '
382 'but the RCS file is inaccessible.'
383 % rcs_pathname)
385 rcsparse.Parser().parse(rcsfile, self)
388 # Return a string that has not been returned by gen_key() before.
389 gen_key_base = 0L
390 def gen_key():
391 global gen_key_base
392 key = '%x' % gen_key_base
393 gen_key_base = gen_key_base + 1
394 return key
397 class Change:
398 """Class for recording what actually happened when a change is made,
399 because not all of the result is guessable by the caller.
400 See RepositoryMirror.change_path() for more.
402 The fields are
405 'A' if path was added, 'C' if changed, or '-' if no action.
407 closed_tags:
408 List of tags that this path can no longer be the source of,
409 that is, tags which could be rooted in the path before the
410 change, but not after.
412 closed_branches:
413 Like closed_tags, but for branches.
415 deleted_entries:
416 The list of entries deleted from the destination after
417 copying a directory, or None.
419 copyfrom_rev:
420 The actual revision from which the path was copied, which
421 may be one less than the requested revision when the path
422 was deleted in the requested revision, or None."""
423 def __init__(self, op, closed_tags, closed_branches,
424 deleted_entries=None, copyfrom_rev=None):
425 self.op = op
426 self.closed_tags = closed_tags
427 self.closed_branches = closed_branches
428 self.deleted_entries = deleted_entries
429 self.copyfrom_rev = copyfrom_rev
432 class RepositoryMirror:
433 def __init__(self):
434 # This corresponds to the 'revisions' table in a Subversion fs.
435 self.revs_db_file = SVN_REVISIONS_DB
436 self.revs_db = anydbm.open(self.revs_db_file, 'n')
438 # This corresponds to the 'nodes' table in a Subversion fs. (We
439 # don't need a 'representations' or 'strings' table because we
440 # only track metadata, not file contents.
441 self.nodes_db_file = NODES_DB
442 self.nodes_db = anydbm.open(self.nodes_db_file, 'n')
444 # This tracks which symbolic names the current "head" of a given
445 # filepath could be the origin node for. When the next commit on
446 # that path comes along, we can tell which symbolic names
447 # originated in the previous version, and signal back to the
448 # caller that the file can no longer be the origin for those names.
450 # The values are marshalled tuples, (tags, branches), where each
451 # value is a list.
452 self.symroots_db_file = SYMBOLIC_NAME_ROOTS_DB
453 self.symroots_db = anydbm.open(self.symroots_db_file, 'n')
455 # When copying a directory (say, to create part of a branch), we
456 # pass change_path() a list of expected entries, so it can remove
457 # any that are in the source but don't belong on the branch.
458 # However, because creating a given region of a branch can involve
459 # copying from several sources, we don't want later copy
460 # operations to delete entries that were legitimately created by
461 # earlier copy ops. So after a copy, the directory records
462 # legitimate entries under this key, in a dictionary (the keys are
463 # entry names, the values can be ignored).
464 self.approved_entries = "/approved-entries"
466 # Set on a directory that's mutable in the revision currently
467 # being constructed. (Yes, this is exactly analogous to
468 # the Subversion filesystem code's concept of mutability.)
469 self.mutable_flag = "/mutable"
470 # This could represent a new mutable directory or file.
471 self.empty_mutable_thang = { self.mutable_flag : 1 }
473 # Init a root directory with no entries at revision 0.
474 self.youngest = 0
475 self.revs_db[str(self.youngest)] = gen_key()
476 self.nodes_db[self.revs_db[str(self.youngest)]] = marshal.dumps({})
478 def new_revision(self):
479 """Stabilize the current revision, then start the next one.
480 (Increments youngest.)"""
481 self.stabilize_youngest()
482 self.revs_db[str(self.youngest + 1)] \
483 = self.revs_db[str(self.youngest)]
484 self.youngest = self.youngest + 1
486 def _stabilize_directory(self, key):
487 """Close the directory whose node key is KEY."""
488 dir = marshal.loads(self.nodes_db[key])
489 if dir.has_key(self.mutable_flag):
490 del dir[self.mutable_flag]
491 if dir.has_key(self.approved_entries):
492 del dir[self.approved_entries]
493 for entry_key in dir.keys():
494 if not entry_key[0] == '/':
495 self._stabilize_directory(dir[entry_key])
496 self.nodes_db[key] = marshal.dumps(dir)
498 def stabilize_youngest(self):
499 """Stabilize the current revision by removing mutable flags."""
500 root_key = self.revs_db[str(self.youngest)]
501 self._stabilize_directory(root_key)
503 def probe_path(self, path, revision=-1, debugging=None):
504 """If PATH exists in REVISION of the svn repository mirror,
505 return its leaf value, else return None.
506 If DEBUGGING is true, then print trace output to stdout.
507 REVISION defaults to youngest, and PATH must not start with '/'."""
508 components = string.split(path, '/')
509 if revision == -1:
510 revision = self.youngest
512 if debugging:
513 print "PROBING path: '%s' in %d" % (path, revision)
515 parent_key = self.revs_db[str(revision)]
516 parent = marshal.loads(self.nodes_db[parent_key])
517 previous_component = "/"
519 i = 1
520 for component in components:
522 if debugging:
523 print " " * i,
524 print "'%s' key: %s, val:" % (previous_component, parent_key), parent
526 if not parent.has_key(component):
527 if debugging:
528 print " PROBE ABANDONED: '%s' does not contain '%s'" \
529 % (previous_component, component)
530 return None
532 this_entry_key = parent[component]
533 this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
534 parent_key = this_entry_key
535 parent = this_entry_val
536 previous_component = component
537 i = i + 1
539 if debugging:
540 print " " * i,
541 print "parent_key: %s, val:" % parent_key, parent
543 # It's not actually a parent at this point, it's the leaf node.
544 return parent
546 def change_path(self, path, tags, branches,
547 intermediate_dir_func=None,
548 copyfrom_path=None, copyfrom_rev=None,
549 expected_entries=None, only_if_already_exists=None):
550 """Record a change to PATH. PATH may not have a leading slash.
551 Return a Change instance representing the result of the
552 change.
554 TAGS are any tags that sprout from this revision of PATH, BRANCHES
555 are any branches that sprout from this revision of PATH.
557 If INTERMEDIATE_DIR_FUNC is not None, then invoke it once on
558 each full path to each missing intermediate directory in PATH, in
559 order from shortest to longest.
561 If COPYFROM_REV and COPYFROM_PATH are not None, then they are a
562 revision and path to record as the copyfrom sources of this node.
563 Since this implies an 'A'dd, it would be reasonable to error and
564 exit if the copyfrom args are present but the node also already
565 exists. Reasonable -- but not what we do :-). The most useful
566 behavior for callers is instead to report that nothing was done,
567 by returning '-' for Change.op, so that's what we do.
569 It is an error for only one copyfrom argument to be present.
571 If EXPECTED_ENTRIES is not None, then it holds entries expected
572 to be in the dst after the copy. Any entries in the new dst but
573 not in EXPECTED_ENTRIES are removed (ignoring keys beginning with
574 '/'), and the removed entries returned in Change.deleted_entries,
575 which are otherwise None.
577 No action is taken for keys in EXPECTED_ENTRIES but not in the
578 dst; it is assumed that the caller will compensate for these by
579 calling change_path again with other arguments.
581 If ONLY_IF_ALREADY_EXISTS is set, then do a no-op, rather than an add,
582 if the path does not exist. This is to allow pruning using EXPECTED_ENTRIES
583 without risking erroneously adding a path."""
584 if ((copyfrom_rev and not copyfrom_path) or
585 (copyfrom_path and not copyfrom_rev)):
586 sys.stderr.write("error: change_path() called with one copyfrom "
587 "argument but not the other.\n")
588 sys.exit(1)
590 components = string.split(path, '/')
591 path_so_far = None
593 parent_key = self.revs_db[str(self.youngest)]
594 parent = marshal.loads(self.nodes_db[parent_key])
595 if not parent.has_key(self.mutable_flag):
596 parent_key = gen_key()
597 parent[self.mutable_flag] = 1
598 self.nodes_db[parent_key] = marshal.dumps(parent)
599 self.revs_db[str(self.youngest)] = parent_key
601 for component in components[:-1]:
602 # parent is always mutable at the top of the loop
604 if path_so_far:
605 path_so_far = path_so_far + '/' + component
606 else:
607 path_so_far = component
609 # Ensure that the parent has an entry for this component.
610 if not parent.has_key(component):
611 if only_if_already_exists:
612 if expected_entries:
613 return Change(OP_NOOP, [], [], [])
614 else:
615 return Change(OP_NOOP, [], [])
616 # else
617 new_child_key = gen_key()
618 parent[component] = new_child_key
619 self.nodes_db[new_child_key] = marshal.dumps(self.empty_mutable_thang)
620 self.nodes_db[parent_key] = marshal.dumps(parent)
621 if intermediate_dir_func:
622 intermediate_dir_func(path_so_far)
624 # One way or another, parent dir now has an entry for component,
625 # so grab it, see if it's mutable, and DTRT if it's not. (Note
626 # it's important to reread the entry value from the db, even
627 # though we might have just written it -- if we tweak existing
628 # data structures, we could modify self.empty_mutable_thang,
629 # which must not happen.)
630 this_entry_key = parent[component]
631 this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
632 if not this_entry_val.has_key(self.mutable_flag):
633 this_entry_val[self.mutable_flag] = 1
634 this_entry_key = gen_key()
635 parent[component] = this_entry_key
636 self.nodes_db[this_entry_key] = marshal.dumps(this_entry_val)
637 self.nodes_db[parent_key] = marshal.dumps(parent)
639 parent_key = this_entry_key
640 parent = this_entry_val
642 # Now change the last node, the versioned file. Just like at the
643 # top of the above loop, parent is already mutable.
644 op = OP_ADD
645 if self.symroots_db.has_key(path):
646 old_names = marshal.loads(self.symroots_db[path])
647 else:
648 old_names = [], []
649 last_component = components[-1]
650 new_val = { }
651 if parent.has_key(last_component):
652 # The contract for copying over existing nodes is to do nothing
653 # and return:
654 if copyfrom_path:
655 if expected_entries:
656 return Change(OP_NOOP, old_names[0], old_names[1], [])
657 else:
658 return Change(OP_NOOP, old_names[0], old_names[1])
659 # else
660 op = OP_CHANGE
661 new_val = marshal.loads(self.nodes_db[parent[last_component]])
662 elif only_if_already_exists:
663 if expected_entries:
664 return Change(OP_NOOP, [], [], [])
665 else:
666 return Change(OP_NOOP, [], [])
668 leaf_key = gen_key()
669 deletions = []
670 actual_copy_rev = copyfrom_rev
671 if copyfrom_path:
672 new_val = self.probe_path(copyfrom_path, copyfrom_rev)
673 if new_val is None:
674 # Sometimes a branch is rooted in a revision that RCS has
675 # marked as 'dead'. Since that path will have been deleted in
676 # the corresponding Subversion revision, we use the revision
677 # right before it as the copyfrom rev, and return that to the
678 # caller so it can emit the right dumpfile instructions.
679 actual_copy_rev = copyfrom_rev - 1
680 new_val = self.probe_path(copyfrom_path, actual_copy_rev)
681 if expected_entries:
682 approved_entries = new_val.get(self.approved_entries) or { }
683 new_approved_entries = { }
684 for ent in new_val.keys():
685 if (ent[0] != '/'):
686 if (not expected_entries.has_key(ent)
687 and not approved_entries.has_key(ent)):
688 del new_val[ent]
689 deletions.append(ent)
690 else:
691 new_approved_entries[ent] = 1
692 new_val[self.approved_entries] = new_approved_entries
693 parent[last_component] = leaf_key
694 self.nodes_db[parent_key] = marshal.dumps(parent)
695 self.symroots_db[path] = marshal.dumps((tags, branches))
696 new_val[self.mutable_flag] = 1
697 self.nodes_db[leaf_key] = marshal.dumps(new_val)
699 if expected_entries:
700 return Change(op, old_names[0], old_names[1], deletions, actual_copy_rev)
701 else:
702 return Change(op, old_names[0], old_names[1], None, actual_copy_rev)
704 def delete_path(self, path, tags, branches, prune=None):
705 """Delete PATH from the tree. PATH may not have a leading slash.
707 Return a tuple (path_deleted, closed_tags, closed_branches), where
708 path_deleted is the path actually deleted or None if PATH did not
709 exist, and closed_tags and closed_branches are lists of symbolic
710 names closed off by this deletion -- that is, tags or branches
711 which could be rooted in the previous revision of PATH, but not in
712 this revision, because this rev changes PATH. If path_deleted is
713 None, then closed_tags and closed_branches will both be empty.
715 TAGS are any tags that sprout from this revision of PATH, BRANCHES
716 are any branches that sprout from this revision of PATH. (I can't
717 imagine that there are any of either, what to do if there are?)
719 If PRUNE is not None, then delete the highest possible directory,
720 which means the returned path may differ from PATH. In other
721 words, if PATH was the last entry in its parent, then delete
722 PATH's parent, unless it too is the last entry in *its* parent, in
723 which case delete that parent, and and so on up the chain, until a
724 directory is encountered that has an entry which is not a member
725 of the parent stack of the original target.
727 PRUNE is like the -P option to 'cvs checkout'."""
729 components = string.split(path, '/')
730 path_so_far = None
732 # Start out assuming that we will delete it. The for-loop may
733 # change this to None, if it turns out we can't even reach the
734 # path (i.e., it is already deleted).
735 retval = path
737 parent_key = self.revs_db[str(self.youngest)]
738 parent = marshal.loads(self.nodes_db[parent_key])
740 # As we walk down to find the dest, we remember each parent
741 # directory's name and db key, in reverse order: push each new key
742 # onto the front of the list, so that by the time we reach the
743 # destination node, the zeroth item in the list is the parent of
744 # that destination.
746 # Then if we actually do the deletion, we walk the list from left
747 # to right, replacing as appropriate.
749 # The root directory has name None.
750 parent_chain = [ ]
751 parent_chain.insert(0, (None, parent_key))
753 def is_prunable(dir):
754 """Return true if DIR, a dictionary representing a directory,
755 has just zero or one non-special entry, else return false.
756 (In a pure world, we'd just ask len(DIR) > 1; it's only
757 because the directory might have mutable flags and other special
758 entries that we need this function at all.)"""
759 num_items = len(dir)
760 if num_items > 3:
761 return None
762 if num_items == 3 or num_items == 2:
763 real_entries = 0
764 for key in dir.keys():
765 if not key[0] == '/': real_entries = real_entries + 1
766 if real_entries > 1:
767 return None
768 else:
769 return 1
770 else:
771 return 1
773 for component in components[:-1]:
774 # parent is always mutable at the top of the loop
776 if path_so_far:
777 path_so_far = path_so_far + '/' + component
778 else:
779 path_so_far = component
781 # If we can't reach the dest, then we don't need to do anything.
782 if not parent.has_key(component):
783 return None, [], []
785 # Otherwise continue downward, dropping breadcrumbs.
786 this_entry_key = parent[component]
787 this_entry_val = marshal.loads(self.nodes_db[this_entry_key])
788 parent_key = this_entry_key
789 parent = this_entry_val
790 parent_chain.insert(0, (component, parent_key))
792 # If the target is not present in its parent, then we're done.
793 last_component = components[-1]
794 old_names = [], []
795 if not parent.has_key(last_component):
796 return None, [], []
797 elif self.symroots_db.has_key(path):
798 old_names = marshal.loads(self.symroots_db[path])
799 del self.symroots_db[path]
801 # The target is present, so remove it and bubble up, making a new
802 # mutable path and/or pruning as necessary.
803 pruned_count = 0
804 prev_entry_name = last_component
805 new_key = None
806 for parent_item in parent_chain:
807 pkey = parent_item[1]
808 pval = marshal.loads(self.nodes_db[pkey])
809 if prune and (new_key is None) and is_prunable(pval):
810 pruned_count = pruned_count + 1
811 pass
812 # Do nothing more. All the action takes place when we hit a
813 # non-prunable parent.
814 else:
815 # We hit a non-prunable, or aren't pruning, so bubble up the new gospel.
816 pval[self.mutable_flag] = 1
817 if new_key is None:
818 del pval[prev_entry_name]
819 else:
820 pval[prev_entry_name] = new_key
821 new_key = gen_key()
823 prev_entry_name = parent_item[0]
824 if new_key:
825 self.nodes_db[new_key] = marshal.dumps(pval)
827 if new_key is None:
828 new_key = gen_key()
829 self.nodes_db[new_key] = marshal.dumps(self.empty_mutable_thang)
831 # Install the new root entry.
832 self.revs_db[str(self.youngest)] = new_key
834 if pruned_count > len(components):
835 sys.stderr.write("Error: deleting '%s' tried to prune %d components.\n"
836 % (path, pruned_count))
837 sys.exit(1)
839 if pruned_count:
840 if pruned_count == len(components):
841 # We never prune away the root directory, so back up one component.
842 pruned_count = pruned_count - 1
843 retpath = string.join(components[:0 - pruned_count], '/')
844 else:
845 retpath = path
847 return retpath, old_names[0], old_names[1]
849 ### We've no place to put tags + branches. Suspect we just
850 ### shouldn't be taking them as arguments, which the doc string
851 ### implies already. Ponder.
853 def close(self):
854 # Just stabilize the last revision. This may or may not affect
855 # anything, but if we end up using the mirror for anything after
856 # this, it's nice to know the '/mutable' entries are gone.
857 self.stabilize_youngest()
860 class Dumper:
861 def __init__(self, dumpfile_path):
862 'Open DUMPFILE_PATH, and initialize revision to REVISION.'
863 self.dumpfile_path = dumpfile_path
864 self.revision = 0
865 self.dumpfile = open(dumpfile_path, 'wb')
866 self.repos_mirror = RepositoryMirror()
868 # Initialize the dumpfile with the standard headers:
870 # The CVS repository doesn't have a UUID, and the Subversion
871 # repository will be created with one anyway. So when we load
872 # the dumpfile, we'll tell svnadmin to ignore the UUID below.
873 self.dumpfile.write('SVN-fs-dump-format-version: 2\n'
874 '\n')
876 def start_revision(self, props):
877 """Write the next revision, with properties, to the dumpfile.
878 Return the newly started revision."""
880 self.revision = self.revision + 1
882 # A revision typically looks like this:
884 # Revision-number: 1
885 # Prop-content-length: 129
886 # Content-length: 129
888 # K 7
889 # svn:log
890 # V 27
891 # Log message for revision 1.
892 # K 10
893 # svn:author
894 # V 7
895 # jrandom
896 # K 8
897 # svn:date
898 # V 27
899 # 2003-04-22T22:57:58.132837Z
900 # PROPS-END
902 # Notice that the length headers count everything -- not just the
903 # length of the data but also the lengths of the lengths, including
904 # the 'K ' or 'V ' prefixes.
906 # The reason there are both Prop-content-length and Content-length
907 # is that the former includes just props, while the latter includes
908 # everything. That's the generic header form for any entity in a
909 # dumpfile. But since revisions only have props, the two lengths
910 # are always the same for revisions.
912 # Calculate the total length of the props section.
913 total_len = 10 # len('PROPS-END\n')
914 for propname in props.keys():
915 klen = len(propname)
916 klen_len = len('K %d' % klen)
917 vlen = len(props[propname])
918 vlen_len = len('V %d' % vlen)
919 # + 4 for the four newlines within a given property's section
920 total_len = total_len + klen + klen_len + vlen + vlen_len + 4
922 # Print the revision header and props
923 self.dumpfile.write('Revision-number: %d\n'
924 'Prop-content-length: %d\n'
925 'Content-length: %d\n'
926 '\n'
927 % (self.revision, total_len, total_len))
929 for propname in props.keys():
930 self.dumpfile.write('K %d\n'
931 '%s\n'
932 'V %d\n'
933 '%s\n' % (len(propname),
934 propname,
935 len(props[propname]),
936 props[propname]))
938 self.dumpfile.write('PROPS-END\n')
939 self.dumpfile.write('\n')
941 self.repos_mirror.new_revision()
942 return self.revision
944 def add_dir(self, path):
945 self.dumpfile.write("Node-path: %s\n"
946 "Node-kind: dir\n"
947 "Node-action: add\n"
948 "Prop-content-length: 10\n"
949 "Content-length: 10\n"
950 "\n"
951 "PROPS-END\n"
952 "\n"
953 "\n" % path)
955 def probe_path(self, path):
956 """Return true if PATH exists in the youngest tree of the svn
957 repository, else return None. PATH does not start with '/'."""
958 if self.repos_mirror.probe_path(path) is None:
959 return None
960 else:
961 return 1
963 def copy_path(self, svn_src_path, svn_src_rev, svn_dst_path, entries=None):
964 """Emit a copy of SVN_SRC_PATH at SVN_SRC_REV to SVN_DST_PATH.
965 If ENTRIES is not None, it is a dictionary whose keys are the full
966 set of entries the new copy is expected to have -- and therefore
967 any entries in the new dst but not in ENTRIES will be removed.
968 (Keys in ENTRIES beginning with '/' are ignored.)
970 No action is taken for keys in ENTRIES but not in the dst; it is
971 assumed that the caller will compensate for these by calling
972 copy_path again with other arguments."""
973 change = self.repos_mirror.change_path(svn_dst_path,
974 [], [],
975 self.add_dir,
976 svn_src_path, svn_src_rev,
977 entries)
978 if change.op == 'A':
979 # We don't need to include "Node-kind:" for copies; the loader
980 # ignores it anyway and just uses the source kind instead.
981 self.dumpfile.write('Node-path: %s\n'
982 'Node-action: add\n'
983 'Node-copyfrom-rev: %d\n'
984 'Node-copyfrom-path: /%s\n'
985 '\n'
986 % (svn_dst_path, change.copyfrom_rev, svn_src_path))
988 for ent in change.deleted_entries:
989 self.dumpfile.write('Node-path: %s\n'
990 'Node-action: delete\n'
991 '\n' % (svn_dst_path + '/' + ent))
993 def prune_entries(self, path, expected):
994 """Delete any entries in PATH that are not in list EXPECTED.
995 PATH need not be a directory, but of course nothing will happen if
996 it's a file. Entries beginning with '/' are ignored as usual."""
997 change = self.repos_mirror.change_path(path,
998 [], [],
999 self.add_dir,
1000 None, None,
1001 expected, 1)
1002 for ent in change.deleted_entries:
1003 self.dumpfile.write('Node-path: %s\n'
1004 'Node-action: delete\n'
1005 '\n' % (path + '/' + ent))
1007 def add_or_change_path(self, cvs_path, svn_path, cvs_rev, rcs_file,
1008 tags, branches):
1010 # figure out the real file path for "co"
1011 try:
1012 f_st = os.stat(rcs_file)
1013 except os.error:
1014 dirname, fname = os.path.split(rcs_file)
1015 rcs_file = os.path.join(dirname, 'Attic', fname)
1016 f_st = os.stat(rcs_file)
1018 if f_st[0] & stat.S_IXUSR:
1019 is_executable = 1
1020 # "K 14\n" + "svn:executable\n" + "V 1\n" + "*\n" + "PROPS-END\n"
1021 props_len = 36
1022 else:
1023 is_executable = 0
1024 # just "PROPS-END\n"
1025 props_len = 10
1027 ### FIXME: We ought to notice the -kb flag set on the RCS file and
1028 ### use it to set svn:mime-type.
1030 basename = os.path.basename(rcs_file[:-2])
1031 pipe = os.popen('co -q -p%s \'%s\''
1032 % (cvs_rev, rcs_file.replace("'", "'\\''")), 'r')
1034 # You might think we could just test
1036 # if cvs_rev[-2:] == '.1':
1038 # to determine if this path exists in head yet. But that wouldn't
1039 # be perfectly reliable, both because of 'cvs commit -r', and also
1040 # the possibility of file resurrection.
1041 change = self.repos_mirror.change_path(svn_path, tags, branches,
1042 self.add_dir)
1044 if change.op == OP_ADD:
1045 action = 'add'
1046 else:
1047 action = 'change'
1049 self.dumpfile.write('Node-path: %s\n'
1050 'Node-kind: file\n'
1051 'Node-action: %s\n'
1052 'Prop-content-length: %d\n'
1053 'Text-content-length: '
1054 % (svn_path, action, props_len))
1056 pos = self.dumpfile.tell()
1058 self.dumpfile.write('0000000000000000\n'
1059 'Text-content-md5: 00000000000000000000000000000000\n'
1060 'Content-length: 0000000000000000\n'
1061 '\n')
1063 if is_executable:
1064 self.dumpfile.write('K 14\n'
1065 'svn:executable\n'
1066 'V 1\n'
1067 '*\n')
1069 self.dumpfile.write('PROPS-END\n')
1071 # Insert the rev contents, calculating length and checksum as we go.
1072 checksum = md5.new()
1073 length = 0
1074 buf = pipe.read()
1075 while buf:
1076 checksum.update(buf)
1077 length = length + len(buf)
1078 self.dumpfile.write(buf)
1079 buf = pipe.read()
1080 pipe.close()
1082 # Go back to patch up the length and checksum headers:
1083 self.dumpfile.seek(pos, 0)
1084 # We left 16 zeros for the text length; replace them with the real
1085 # length, padded on the left with spaces:
1086 self.dumpfile.write('%16d' % length)
1087 # 16... + 1 newline + len('Text-content-md5: ') == 35
1088 self.dumpfile.seek(pos + 35, 0)
1089 self.dumpfile.write(checksum.hexdigest())
1090 # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
1091 self.dumpfile.seek(pos + 84, 0)
1092 # The content length is the length of property data, text data,
1093 # and any metadata around/inside around them.
1094 self.dumpfile.write('%16d' % (length + props_len))
1095 # Jump back to the end of the stream
1096 self.dumpfile.seek(0, 2)
1098 # This record is done.
1099 self.dumpfile.write('\n')
1100 return change.closed_tags, change.closed_branches
1102 def delete_path(self, svn_path, tags, branches, prune=None):
1103 """If SVN_PATH exists in the head mirror, output the deletion to
1104 the dumpfile, else output nothing to the dumpfile.
1106 Return a tuple (path_deleted, closed_tags, closed_branches), where
1107 path_deleted is the path deleted if any or None if no deletion was
1108 necessary, and closed_tags and closed_names are lists of symbolic
1109 names closed off by this deletion -- that is, tags or branches
1110 which could be rooted in the previous revision of PATH, but not in
1111 this revision, because this rev changes PATH. If path_deleted is
1112 None, then closed_tags and closed_branches will both be empty.
1114 Iff PRUNE is true, then the path deleted can be not None, yet
1115 shorter than SVN_PATH because of pruning."""
1116 deleted_path, closed_tags, closed_branches \
1117 = self.repos_mirror.delete_path(svn_path, tags,
1118 branches, prune)
1119 if deleted_path:
1120 print ' (deleted %s)' % deleted_path
1121 self.dumpfile.write('Node-path: %s\n'
1122 'Node-action: delete\n'
1123 '\n' % deleted_path)
1124 return deleted_path, closed_tags, closed_branches
1126 def close(self):
1127 self.repos_mirror.close()
1128 self.dumpfile.close()
1131 def format_date(date):
1132 """Return an svn-compatible date string for DATE (seconds since epoch)."""
1133 # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
1134 return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
1137 def make_revision_props(symbolic_name, is_tag):
1138 """Return a dictionary of revision properties for the manufactured
1139 commit that finished SYMBOLIC_NAME. If IS_TAG is true, write the
1140 log message as though for a tag, else as though for a branch."""
1141 if is_tag:
1142 type = 'tag'
1143 else:
1144 type = 'branch'
1146 # In Python 2.2.3, we could use textwrap.fill(). Oh well :-).
1147 if len(symbolic_name) >= 13:
1148 space_or_newline = '\n'
1149 else:
1150 space_or_newline = ' '
1152 log = "This commit was manufactured by cvs2svn to create %s%s'%s'." \
1153 % (type, space_or_newline, symbolic_name)
1155 return { 'svn:author' : 'unknown',
1156 'svn:log' : log,
1157 'svn:date' : format_date(time.time())}
1160 class SymbolicNameTracker:
1161 """Track the Subversion path/revision ranges of CVS symbolic names.
1162 This is done in a .db file, representing a tree in the usual way.
1163 In addition to directory entries, each object in the database stores
1164 the earliest revision from which it could be copied, and the first
1165 revision from which it could no longer be copied. Intermediate
1166 directories go one step farther: they record counts for the various
1167 revisions from which items under them could have been copied, and
1168 counts for the cutoff revisions. For example:
1170 .----------.
1171 | sub1 | [(2, 1), (3, 3)]
1172 | / | [(5, 1), (17, 2), (50, 1)]
1173 | / |
1174 |/ sub2 |
1175 / \ |
1176 /|_____\____|
1177 / \
1178 ______/ \_________
1179 / \
1180 / \
1181 / \
1182 .---------. .---------.
1183 | file1 | | file3 |
1184 | / | [(3, 2)] | \ | [(2, 1), (3, 1)]
1185 | / | [(17, 1), (50, 1)] | \ | [(5, 1), (10, 1)]
1186 | / | | \ |
1187 |/ file2 | | file4 \|
1188 / \ | | / \
1189 /|_____\___| |___/_____|\
1190 / \ / \
1191 / \ / \
1192 / \ / \
1193 / + / +
1194 +======+ | +======+ |
1195 | | [(3, 1)] | | | [(2, 1)] |
1196 | | [(17, 1)] | | | [(5, 1)] |
1197 | | | | | |
1198 +======+ | +======+ |
1199 +======+ +======+
1200 | | [(3, 1)] | | [(3, 1)]
1201 | | [(50, 1)] | | [(17, 1)]
1202 | | | |
1203 +======+ +======+
1205 The two lists to the right of each node represent the 'opening' and
1206 'closing' revisions respectively. Each tuple in a list is of the
1207 form (REV, COUNT). For leaf nodes, COUNT is always 1, of course.
1208 For intermediate nodes, the counts are the sums of the corresponding
1209 counts of child nodes.
1211 These revision scores are used to determine the optimal copy
1212 revisions for each tree/subtree at branch or tag creation time.
1214 The svn path input will most often be a trunk path, because the
1215 path/rev information recorded here is about where and when the given
1216 symbolic name could be rooted, *not* a path/rev for which commits
1217 along that symbolic name take place (of course, commits only happen on
1218 branches anyway)."""
1220 def __init__(self):
1221 self.db_file = SYMBOLIC_NAMES_DB
1222 self.db = anydbm.open(self.db_file, 'n')
1223 self.root_key = gen_key()
1224 self.db[self.root_key] = marshal.dumps({})
1226 # The keys for the opening and closing revision lists attached to
1227 # each directory or file. Includes "/" so as never to conflict
1228 # with any real entry.
1229 self.tags_opening_revs_key = "/tag-openings"
1230 self.tags_closing_revs_key = "/tag-closings"
1231 self.br_opening_revs_key = "/br-openings"
1232 self.br_closing_revs_key = "/br-closings"
1234 # When a node is copied into the repository, the revision copied
1235 # is stored under the appropriate key, and the corresponding
1236 # opening and closing rev lists are removed.
1237 self.tags_copyfrom_rev_key = "/tags-copyfrom-rev"
1238 self.br_copyfrom_rev_key = "/br-copyfrom-rev"
1240 def probe_path(self, symbolic_name, path, debugging=None):
1241 """If 'SYMBOLIC_NAME/PATH' exists in the symbolic name tree,
1242 return the value of its last component, else return None.
1243 PATH may be None, but may not start with '/'.
1244 If DEBUGGING is true, then print trace output to stdout."""
1245 if path:
1246 components = [symbolic_name] + string.split(path, '/')
1247 else:
1248 components = [symbolic_name]
1250 if debugging:
1251 print "PROBING SYMBOLIC NAME:\n", components
1253 parent_key = self.root_key
1254 parent = marshal.loads(self.db[parent_key])
1255 last_component = "/"
1256 i = 1
1257 for component in components:
1258 if debugging:
1259 print " " * i,
1260 print "'%s' key: %s, val:" % (last_component, parent_key), parent
1262 if not parent.has_key(component):
1263 sys.stderr.write("SYM PROBE FAILED: '%s' does not contain '%s'\n" \
1264 % (last_component, component))
1265 sys.exit(1)
1267 this_entry_key = parent[component]
1268 this_entry_val = marshal.loads(self.db[this_entry_key])
1269 parent_key = this_entry_key
1270 parent = this_entry_val
1271 last_component = component
1272 i = i + 1
1274 if debugging:
1275 print " " * i,
1276 print "parent_key: %s, val:" % parent_key, parent
1278 # It's not actually a parent at this point, it's the leaf node.
1279 return parent
1281 def bump_rev_count(self, item_key, rev, revlist_key):
1282 """Increment REV's count in opening or closing list under KEY.
1283 REVLIST_KEY is self.*_opening_revs_key or self.*_closing_revs_key,
1284 and indicates which rev list to increment REV's count in.
1286 For example, if REV is 7, REVLIST_KEY is
1287 self.tags_opening_revs_key, and the entry's tags opening revs list
1288 looks like this
1290 [(2, 5), (7, 2), (10, 15)]
1292 then afterwards it would look like this:
1294 [(2, 5), (7, 3), (10, 15)]
1296 But if no tuple for revision 7 were present, then one would be
1297 added, for example
1299 [(2, 5), (10, 15)]
1301 would become
1303 [(2, 5), (7, 1), (10, 15)]
1305 The list is sorted by ascending revision both before and after."""
1307 entry_val = marshal.loads(self.db[item_key])
1309 if not entry_val.has_key(revlist_key):
1310 entry_val[revlist_key] = [(rev, 1)]
1311 else:
1312 rev_counts = entry_val[revlist_key]
1313 for i in range(len(rev_counts)):
1314 this_rev, this_count = rev_counts[i]
1315 if rev == this_rev:
1316 rev_counts[i] = (this_rev, this_count + 1)
1317 break
1318 elif this_rev > rev:
1319 if i > 0:
1320 i = i - 1
1321 rev_counts.insert(i, (rev, 1))
1322 break
1323 else:
1324 rev_counts.append((rev, 1))
1325 entry_val[revlist_key] = rev_counts
1327 self.db[item_key] = marshal.dumps(entry_val)
1329 # The verb form of "root" is "root", but that would be misleading in
1330 # this case; and the opposite of "uproot" is presumably "downroot",
1331 # but that wouldn't exactly clarify either. Hence, "enroot" :-).
1332 def enroot_names(self, svn_path, svn_rev, names, opening_key):
1333 """Record SVN_PATH at SVN_REV as the earliest point from which the
1334 symbolic names in NAMES could be copied. OPENING_KEY is
1335 self.tags_opening_revs_key or self.br_opening_revs_key, to
1336 indicate whether NAMES contains tag names or branch names.
1337 SVN_PATH does not start with '/'."""
1339 # Guard against names == None
1340 if not names:
1341 return
1343 for name in names:
1344 components = [name] + string.split(svn_path, '/')
1345 parent_key = self.root_key
1346 for component in components:
1347 self.bump_rev_count(parent_key, svn_rev, opening_key)
1348 parent = marshal.loads(self.db[parent_key])
1349 if not parent.has_key(component):
1350 new_child_key = gen_key()
1351 parent[component] = new_child_key
1352 self.db[new_child_key] = marshal.dumps({})
1353 self.db[parent_key] = marshal.dumps(parent)
1354 # One way or another, parent now has an entry for component.
1355 this_entry_key = parent[component]
1356 this_entry_val = marshal.loads(self.db[this_entry_key])
1357 # Swaparoo.
1358 parent_key = this_entry_key
1359 parent = this_entry_val
1361 self.bump_rev_count(parent_key, svn_rev, opening_key)
1363 def enroot_tags(self, svn_path, svn_rev, tags):
1364 """Record SVN_PATH at SVN_REV as the earliest point from which the
1365 symbolic names in TAGS could be copied. SVN_PATH does not start
1366 with '/'."""
1367 self.enroot_names(svn_path, svn_rev, tags, self.tags_opening_revs_key)
1369 def enroot_branches(self, svn_path, svn_rev, branches):
1370 """Record SVN_PATH at SVN_REV as the earliest point from which the
1371 symbolic names in BRANCHES could be copied. SVN_PATH does not
1372 start with '/'."""
1373 self.enroot_names(svn_path, svn_rev, branches, self.br_opening_revs_key)
1375 def close_names(self, svn_path, svn_rev, names, closing_key):
1376 """Record that as of SVN_REV, SVN_PATH could no longer be the
1377 source from which any of symbolic names in NAMES could be copied.
1378 CLOSING_KEY is self.tags_closing_revs_key or
1379 self.br_closing_revs_key, to indicate whether NAMES are tags or
1380 branches. SVN_PATH does not start with '/'."""
1382 # Guard against names == None
1383 if not names:
1384 return
1386 for name in names:
1387 components = [name] + string.split(svn_path, '/')
1388 parent_key = self.root_key
1389 for component in components:
1390 self.bump_rev_count(parent_key, svn_rev, closing_key)
1391 parent = marshal.loads(self.db[parent_key])
1392 if not parent.has_key(component):
1393 sys.stderr.write("In path '%s', value for parent key '%s' "
1394 "does not have entry '%s'\n" \
1395 % (svn_path, parent_key, component))
1396 sys.exit(1)
1397 this_entry_key = parent[component]
1398 this_entry_val = marshal.loads(self.db[this_entry_key])
1399 # Swaparoo.
1400 parent_key = this_entry_key
1401 parent = this_entry_val
1403 self.bump_rev_count(parent_key, svn_rev, closing_key)
1405 def close_tags(self, svn_path, svn_rev, tags):
1406 """Record that as of SVN_REV, SVN_PATH could no longer be the
1407 source from which any of TAGS could be copied. SVN_PATH does not
1408 start with '/'."""
1409 self.close_names(svn_path, svn_rev, tags, self.tags_closing_revs_key)
1411 def close_branches(self, svn_path, svn_rev, branches):
1412 """Record that as of SVN_REV, SVN_PATH could no longer be the
1413 source from which any of BRANCHES could be copied. SVN_PATH does
1414 not start with '/'."""
1415 self.close_names(svn_path, svn_rev, branches, self.br_closing_revs_key)
1417 def score_revisions(self, openings, closings):
1418 """Return a list of revisions and scores based on OPENINGS and
1419 CLOSINGS. The returned list looks like:
1421 [(REV1 SCORE1), (REV2 SCORE2), ...]
1423 where REV2 > REV1 and all scores are > 0. OPENINGS and CLOSINGS
1424 are the values of self.tags_opening_revs_key and
1425 self.tags_closing_revs_key, or self.br_opening_revs_key and
1426 self.br_closing_revs_key, from some file or directory node, or
1427 else None.
1429 Each score indicates that copying the corresponding revision of
1430 the object in question would yield that many correct paths at or
1431 underneath the object. There may be other paths underneath it
1432 which are not correct and need to be deleted or recopied; those
1433 can only be detected by descending and examining their scores.
1435 If OPENINGS is false, return the empty list."""
1437 # First look for easy outs.
1438 if not openings:
1439 return []
1441 # Must be able to call len(closings) below.
1442 if closings is None:
1443 closings = []
1445 # No easy out, so wish for lexical closures and calculate the scores :-).
1446 scores = []
1447 opening_score_accum = 0
1448 for i in range(len(openings)):
1449 pair = openings[i]
1450 opening_score_accum = opening_score_accum + pair[1]
1451 scores.append((pair[0], opening_score_accum))
1452 min = 0
1453 for i in range(len(closings)):
1454 closing_rev = closings[i][0]
1455 closing_score = closings[i][1]
1456 for j in range(min, len(scores)):
1457 opening_pair = scores[j]
1458 if closing_rev <= opening_pair[0]:
1459 scores[j] = (opening_pair[0], opening_pair[1] - closing_score)
1460 else:
1461 min = j + 1
1462 return scores
1464 def best_rev(self, scores):
1465 """Return the revision with the highest score from SCORES, a list
1466 returned by score_revisions()."""
1467 max_score = 0
1468 rev = SVN_INVALID_REVNUM
1469 for pair in scores:
1470 if pair[1] > max_score:
1471 max_score = pair[1]
1472 rev = pair[0]
1473 return rev
1475 # Helper for fill_branch().
1476 def copy_descend(self, dumper, ctx, name, parent, entry_name,
1477 parent_rev, src_path, dst_path, is_tag, jit_new_rev=None):
1478 """Starting with ENTRY_NAME in directory object PARENT at
1479 PARENT_REV, use DUMPER and CTX to copy nodes in the Subversion
1480 repository, manufacturing the source paths with SRC_PATH and the
1481 destination paths with NAME and DST_PATH.
1483 If IS_TAG is true, NAME is treated as a tag, else as a branch.
1485 If JIT_NEW_REV is not None, it is a list of one element. If that
1486 element is true, then if any copies are to be made, invoke
1487 DUMPER.start_revision() before the first copy, then set
1488 JIT_NEW_REV[0] to None, so no more new revisions are made for this
1489 symbolic name anywhere in this descent.
1491 ('JIT' == 'Just In Time'.)"""
1492 ### Hmmm, is passing [1] instead of 1 an idiomatic way of passing
1493 ### a side-effectable boolean in Python? That's how the
1494 ### JIT_NEW_REV parameter works here and elsewhere, but maybe
1495 ### there's a clearer way to do it?
1497 key = parent[entry_name]
1498 val = marshal.loads(self.db[key])
1500 if is_tag:
1501 opening_key = self.tags_opening_revs_key
1502 closing_key = self.tags_closing_revs_key
1503 copyfrom_rev_key = self.tags_copyfrom_rev_key
1504 else:
1505 opening_key = self.br_opening_revs_key
1506 closing_key = self.br_closing_revs_key
1507 copyfrom_rev_key = self.br_copyfrom_rev_key
1509 if not val.has_key(copyfrom_rev_key):
1510 # If not already copied this subdir, calculate its "best rev"
1511 # and see if it differs from parent's best rev.
1512 scores = self.score_revisions(val.get(opening_key), val.get(closing_key))
1513 rev = self.best_rev(scores)
1515 if rev == SVN_INVALID_REVNUM:
1516 return # name is a branch, but we're doing a tag, or vice versa
1518 else:
1519 if is_tag:
1520 copy_dst = make_path(ctx, dst_path, None, name)
1521 else:
1522 copy_dst = make_path(ctx, dst_path, name, None)
1524 if (rev != parent_rev):
1525 parent_rev = rev
1526 if jit_new_rev and jit_new_rev[0]:
1527 dumper.start_revision(make_revision_props(name, is_tag))
1528 jit_new_rev[0] = None
1529 dumper.copy_path(src_path, parent_rev, copy_dst, val)
1530 # Record that this copy is done:
1531 val[copyfrom_rev_key] = parent_rev
1532 if val.has_key(opening_key):
1533 del val[opening_key]
1534 if val.has_key(closing_key):
1535 del val[closing_key]
1536 self.db[key] = marshal.dumps(val)
1537 else:
1538 # Even if we kept the already-present revision of this entry
1539 # instead of copying a new one, we still need to prune out
1540 # anything that's not part of the symbolic name.
1541 dumper.prune_entries(copy_dst, val)
1543 for ent in val.keys():
1544 if not ent[0] == '/':
1545 if src_path:
1546 next_src = src_path + '/' + ent
1547 else:
1548 next_src = ent
1549 if dst_path:
1550 next_dst = dst_path + '/' + ent
1551 else:
1552 next_dst = ent
1553 self.copy_descend(dumper, ctx, name, val, ent, parent_rev,
1554 next_src, next_dst, is_tag, jit_new_rev)
1556 def fill_name(self, dumper, ctx, name, is_tag, jit_new_rev=None):
1557 """Use DUMPER to create all currently available parts of symbolic
1558 name NAME that have not been created already.
1560 If IS_TAG is true, NAME is treated as a tag, else as a branch.
1562 If JIT_NEW_REV is not None, it is a list of one element. If that
1563 element is true, then if any copies are to be made, invoke
1564 DUMPER.start_revision() before the first copy.
1566 ('JIT' == 'Just In Time'.)"""
1568 # A source path looks like this in the symbolic name tree:
1570 # thisbranch/trunk/proj/foo/bar/baz.c
1572 # ...or occasionally...
1574 # thisbranch/branches/sourcebranch/proj/foo/bar/baz.c
1576 # (the latter when 'thisbranch' is branched off 'sourcebranch').
1578 # Meanwhile, we're copying to a location in the repository like
1580 # /branches/thisbranch/proj/foo/bar/baz.c or
1581 # /tags/tagname/proj/foo/bar/baz.c
1583 # Of course all this depends on make_path()'s behavior. At
1584 # various times we've changed the way it produces paths (see
1585 # revisions 6028 and 6347). If it changes again, the logic here
1586 # must be adjusted to match.
1588 parent_key = self.root_key
1589 parent = marshal.loads(self.db[parent_key])
1591 if not parent.has_key(name):
1592 if is_tag:
1593 sys.stderr.write("No origin records for tag '%s'.\n" % name)
1594 else:
1595 sys.stderr.write("No origin records for branch '%s'.\n" % name)
1596 sys.exit(1)
1598 parent_key = parent[name]
1599 parent = marshal.loads(self.db[parent_key])
1601 # All Subversion source paths under the branch start with one of
1602 # three things:
1604 # /trunk/...
1605 # /branches/foo/...
1606 # /tags/foo/...
1608 # (We don't care what foo is, it's just a component to skip over.)
1610 # Since these don't all have the same number of components, we
1611 # manually descend into each as far as necessary, then invoke
1612 # copy_descend() once we're in the right place in both trees.
1614 # Since it's possible for a branch or tag to have some source
1615 # paths on trunk and some on branches, there's some question about
1616 # what to copy as the top-level directory of the branch. Our
1617 # solution is to [somewhat randomly] give preference to trunk.
1618 # Note that none of these paths can ever conflict; for example,
1619 # it would be impossible to have both
1621 # thisbranch/trunk/myproj/lib/drivers.c and
1622 # thisbranch/branches/sourcebranch/myproj/lib/drivers.c
1624 # because that would imply that the symbolic name 'thisbranch'
1625 # appeared twice in the RCS file header, referring to two
1626 # different revisions. Well, I suppose that's *possible*, but its
1627 # effect is undefined, and it's as reasonable for us to just
1628 # overwrite one with the other as anything else -- anyway, isn't
1629 # that what CVS would do if you checked out the branch? <shrug>
1631 if parent.has_key(ctx.trunk_base):
1632 self.copy_descend(dumper, ctx, name, parent, ctx.trunk_base,
1633 SVN_INVALID_REVNUM, ctx.trunk_base, "",
1634 is_tag, jit_new_rev)
1635 if parent.has_key(ctx.branches_base):
1636 branch_base_key = parent[ctx.branches_base]
1637 branch_base = marshal.loads(self.db[branch_base_key])
1638 for this_source in branch_base.keys():
1639 # We skip special names beginning with '/' for the usual
1640 # reason. We skip cases where (this_source == name) for a
1641 # different reason: if a CVS branch were rooted in itself,
1642 # that would imply that the same symbolic name appeared on two
1643 # different branches in an RCS file, which CVS doesn't
1644 # permit. So while it wouldn't hurt to descend, it would be a
1645 # waste of time.
1646 if (this_source[0] != '/') and (this_source != name):
1647 src_path = ctx.branches_base + '/' + this_source
1648 self.copy_descend(dumper, ctx, name, branch_base, this_source,
1649 SVN_INVALID_REVNUM, src_path, "",
1650 is_tag, jit_new_rev)
1652 def fill_tag(self, dumper, ctx, tag, jit_new_rev=None):
1653 """Use DUMPER to create all currently available parts of TAG that
1654 have not been created already. Use CTX.trunk_base, CTX.tags_base,
1655 and CTX.branches_base to determine the source and destination
1656 paths in the Subversion repository.
1658 If JIT_NEW_REV is not None, it is a list of one element. If that
1659 element is true, then if any copies are to be made, invoke
1660 DUMPER.start_revision() before the first copy.
1662 ('JIT' == 'Just In Time'.)"""
1663 self.fill_name(dumper, ctx, tag, 1, jit_new_rev)
1665 def fill_branch(self, dumper, ctx, branch, jit_new_rev=None):
1666 """Use DUMPER to create all currently available parts of BRANCH that
1667 haven't been created already. Use CTX.trunk_base, CTX.tags_base,
1668 and CTX.branches_base to determine the source and destination
1669 paths in the Subversion repository.
1671 If JIT_NEW_REV is not None, it is a list of one element. If that
1672 element is true, then if any copies are to be made, invoke
1673 DUMPER.start_revision() before the first copy.
1675 ('JIT' == 'Just In Time'.)"""
1676 self.fill_name(dumper, ctx, branch, None, jit_new_rev)
1678 def finish(self, dumper, ctx):
1679 """Use DUMPER to finish branches and tags that have either
1680 not been created yet, or have been only partially created.
1681 Use CTX.trunk_base, CTX.tags_base, and CTX.branches_base to
1682 determine the source and destination paths in the Subversion
1683 repository."""
1684 parent_key = self.root_key
1685 parent = marshal.loads(self.db[parent_key])
1686 # Do all branches first, then all tags. We don't bother to check
1687 # here whether a given name is a branch or a tag, or is done
1688 # already; the fill_foo() methods will just do nothing if there's
1689 # nothing to do.
1691 # We do one revision per branch or tag, for clarity to users, not
1692 # for correctness. In CVS, when you make a branch off a branch,
1693 # the new branch will just root itself in the roots of the old
1694 # branch *except* where the new branch sprouts from a revision
1695 # that was actually committed on the old branch. In the former
1696 # cases, the source paths will be the same as the source paths
1697 # from which the old branch was created and therefore will already
1698 # exist; and in the latter case, the source paths will actually be
1699 # on the old branch, but those paths will exist already because
1700 # they were commits on that branch and therefore cvs2svn must have
1701 # created it already (see the fill_branch call in Commit.commit).
1702 # So either way, the source paths exist by the time we need them.
1704 ### It wouldn't be so awfully hard to determine whether a name is
1705 ### just a branch or just a tag, which would allow for more
1706 ### intuitive messages below.
1707 if not ctx.trunk_only:
1708 print "Finishing branches:"
1709 for name in parent.keys():
1710 if name[0] != '/':
1711 print "finishing '%s' as branch" % name
1712 self.fill_branch(dumper, ctx, name, [1])
1713 print "Finishing tags:"
1714 for name in parent.keys():
1715 if name[0] != '/':
1716 print "finishing '%s' as tag" % name
1717 self.fill_tag(dumper, ctx, name, [1])
1720 class Commit:
1721 def __init__(self):
1722 self.files = { }
1723 self.changes = [ ]
1724 self.deletes = [ ]
1725 self.t_min = 1<<30
1726 self.t_max = 0
1728 def has_file(self, fname):
1729 return self.files.has_key(fname)
1731 def add(self, t, op, file, rev, branch_name, tags, branches):
1732 # Record the time range of this commit.
1734 # ### ISSUE: It's possible, though unlikely, that the time range
1735 # of a commit could get gradually expanded to be arbitrarily
1736 # longer than COMMIT_THRESHOLD. I'm not sure this is a huge
1737 # problem, and anyway deciding where to break it up would be a
1738 # judgement call. For now, we just print a warning in commit() if
1739 # this happens.
1740 if t < self.t_min:
1741 self.t_min = t
1742 if t > self.t_max:
1743 self.t_max = t
1745 if op == OP_CHANGE:
1746 self.changes.append((file, rev, branch_name, tags, branches))
1747 else:
1748 # OP_DELETE
1749 self.deletes.append((file, rev, branch_name, tags, branches))
1750 self.files[file] = 1
1752 def get_metadata(self):
1753 # by definition, the author and log message must be the same for all
1754 # items that went into this commit. therefore, just grab any item from
1755 # our record of changes/deletes.
1756 if self.changes:
1757 file, rev, br, tags, branches = self.changes[0]
1758 else:
1759 # there better be one...
1760 file, rev, br, tags, branches = self.deletes[0]
1762 # now, fetch the author/log from the ,v file
1763 rip = RevInfoParser()
1764 rip.parse_cvs_file(file)
1765 author = rip.authors[rev]
1766 log = rip.logs[rev]
1767 # and we already have the date, so just format it
1768 date = format_date(self.t_max)
1770 return author, log, date
1772 def commit(self, dumper, ctx, sym_tracker):
1773 # commit this transaction
1774 seconds = self.t_max - self.t_min
1775 print 'committing: %s, over %d seconds' % (time.ctime(self.t_min), seconds)
1776 if seconds > COMMIT_THRESHOLD:
1777 print 'WARNING: commit spans more than %d seconds' % COMMIT_THRESHOLD
1779 if ctx.dry_run:
1780 for f, r, br, tags, branches in self.changes:
1781 # compute a repository path, dropping the ,v from the file name
1782 svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
1783 print ' adding or changing %s : %s' % (r, svn_path)
1784 for f, r, br, tags, branches in self.deletes:
1785 # compute a repository path, dropping the ,v from the file name
1786 svn_path = make_path(ctx, relative_name(ctx.cvsroot, f[:-2]), br)
1787 print ' deleting %s : %s' % (r, svn_path)
1788 print ' (skipped; dry run enabled)'
1789 return
1791 do_copies = [ ]
1793 # get the metadata for this commit
1794 author, log, date = self.get_metadata()
1795 try:
1796 ### FIXME: The 'replace' behavior should be an option, like
1797 ### --encoding is.
1798 unicode_author = unicode(author, ctx.encoding, 'replace')
1799 unicode_log = unicode(log, ctx.encoding, 'replace')
1800 props = { 'svn:author' : unicode_author.encode('utf8'),
1801 'svn:log' : unicode_log.encode('utf8'),
1802 'svn:date' : date }
1803 except UnicodeError:
1804 print 'Problem encoding author or log message:'
1805 print " author: '%s'" % author
1806 print " log: '%s'" % log
1807 print " date: '%s'" % date
1808 for rcs_file, cvs_rev, br, tags, branches in self.changes:
1809 print " rev %s of '%s'" % (cvs_rev, rcs_file)
1810 print 'Try rerunning with (for example) \"--encoding=latin1\".'
1811 sys.exit(1)
1813 # Tells whether we actually wrote anything to the dumpfile.
1814 svn_rev = SVN_INVALID_REVNUM
1816 for rcs_file, cvs_rev, br, tags, branches in self.changes:
1817 # compute a repository path, dropping the ,v from the file name
1818 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
1819 svn_path = make_path(ctx, cvs_path, br)
1820 if svn_rev == SVN_INVALID_REVNUM:
1821 svn_rev = dumper.start_revision(props)
1822 sym_tracker.enroot_tags(svn_path, svn_rev, tags)
1823 sym_tracker.enroot_branches(svn_path, svn_rev, branches)
1824 if br:
1825 ### FIXME: Here is an obvious optimization point. Probably
1826 ### dump.probe_path(PATH) is kind of slow, because it does N
1827 ### database lookups for the N components in PATH. If this
1828 ### turns out to be a performance bottleneck, we can just
1829 ### maintain a database mirroring just the head tree, but
1830 ### keyed on full paths, to reduce the check to a quick
1831 ### constant time query.
1832 if not dumper.probe_path(svn_path):
1833 sym_tracker.fill_branch(dumper, ctx, br)
1834 # The first revision on a vendor branch is always the same as
1835 # the revision from which the branch sprouts, e.g., 1.1.1.1 is
1836 # always the same as 1.1, so there's no need to further modify
1837 # 1.1.1.1 from however it is in the copy from 1.1.
1838 if not (br and is_vendor_first_revision(cvs_rev)):
1839 print ' adding or changing %s : %s' % (cvs_rev, svn_path)
1840 closed_tags, closed_branches = dumper.add_or_change_path(cvs_path,
1841 svn_path,
1842 cvs_rev,
1843 rcs_file,
1844 tags,
1845 branches)
1846 sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
1847 sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
1849 for rcs_file, cvs_rev, br, tags, branches in self.deletes:
1850 # compute a repository path, dropping the ,v from the file name
1851 cvs_path = relative_name(ctx.cvsroot, rcs_file[:-2])
1852 svn_path = make_path(ctx, cvs_path, br)
1853 print ' deleting %s : %s' % (cvs_rev, svn_path)
1854 if cvs_rev != '1.1':
1855 if svn_rev == SVN_INVALID_REVNUM:
1856 svn_rev = dumper.start_revision(props)
1857 # Uh, can this even happen on a deleted path? Hmmm. If not,
1858 # there's no risk, since tags and branches would just be empty
1859 # and therefore enrooting would be a no-op. Still, it would
1860 # be clearer to know for sure and simply not call it.
1861 sym_tracker.enroot_tags(svn_path, svn_rev, tags)
1862 sym_tracker.enroot_branches(svn_path, svn_rev, branches)
1863 ### FIXME: this will return path_deleted == None if no path
1864 ### was deleted. But we'll already have started the revision
1865 ### by then, so it's a bit late to use the knowledge! Need to
1866 ### reorganize things so that starting the revision is a
1867 ### callback with its own internal conditional, so anyone can
1868 ### just invoke when they know they're really about to do
1869 ### something.
1871 ### Right now what happens is we get an empty revision
1872 ### (assuming nothing else happened in this revision).
1873 path_deleted, closed_tags, closed_branches = \
1874 dumper.delete_path(svn_path, tags, branches, ctx.prune)
1875 sym_tracker.close_tags(svn_path, svn_rev, closed_tags)
1876 sym_tracker.close_branches(svn_path, svn_rev, closed_branches)
1878 if svn_rev != SVN_INVALID_REVNUM:
1879 print ' new revision:', svn_rev
1880 else:
1881 print ' no new revision created, as nothing to do'
1884 def read_resync(fname):
1885 "Read the .resync file into memory."
1887 ### note that we assume that we can hold the entire resync file in
1888 ### memory. really large repositories with whacky timestamps could
1889 ### bust this assumption. should that ever happen, then it is possible
1890 ### to split the resync file into pieces and make multiple passes,
1891 ### using each piece.
1894 # A digest maps to a sequence of lists which specify a lower and upper
1895 # time bound for matching up the commit. We keep a sequence of these
1896 # because a number of checkins with the same log message (e.g. an empty
1897 # log message) could need to be remapped. We also make them a list because
1898 # we will dynamically expand the lower/upper bound as we find commits
1899 # that fall into a particular msg and time range.
1901 # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
1903 resync = { }
1905 for line in fileinput.FileInput(fname):
1906 t1 = int(line[:8], 16)
1907 digest = line[9:DIGEST_END_IDX]
1908 t2 = int(line[DIGEST_END_IDX+1:], 16)
1909 t1_l = t1 - COMMIT_THRESHOLD/2
1910 t1_u = t1 + COMMIT_THRESHOLD/2
1911 if resync.has_key(digest):
1912 resync[digest].append([t1_l, t1_u, t2])
1913 else:
1914 resync[digest] = [ [t1_l, t1_u, t2] ]
1916 # For each digest, sort the resync items in it in increasing order,
1917 # based on the lower time bound.
1918 digests = resync.keys()
1919 for digest in digests:
1920 (resync[digest]).sort()
1922 return resync
1925 def parse_revs_line(line):
1926 data = line.split(' ', 6)
1927 timestamp = int(data[0], 16)
1928 id = data[1]
1929 op = data[2]
1930 rev = data[3]
1931 branch_name = data[4]
1932 if branch_name == "*":
1933 branch_name = None
1934 ntags = int(data[5])
1935 tags = data[6].split(' ', ntags + 1)
1936 nbranches = int(tags[ntags])
1937 branches = tags[ntags + 1].split(' ', nbranches)
1938 fname = branches[nbranches][:-1] # strip \n
1939 tags = tags[:ntags]
1940 branches = branches[:nbranches]
1942 return timestamp, id, op, rev, fname, branch_name, tags, branches
1945 def write_revs_line(output, timestamp, digest, op, revision, fname,
1946 branch_name, tags, branches):
1947 output.write('%08lx %s %s %s ' % (timestamp, digest, op, revision))
1948 if not branch_name:
1949 branch_name = "*"
1950 output.write('%s ' % branch_name)
1951 output.write('%d ' % (len(tags)))
1952 for tag in tags:
1953 output.write('%s ' % (tag))
1954 output.write('%d ' % (len(branches)))
1955 for branch in branches:
1956 output.write('%s ' % (branch))
1957 output.write('%s\n' % fname)
1960 def pass1(ctx):
1961 cd = CollectData(ctx.cvsroot, DATAFILE)
1962 p = rcsparse.Parser()
1963 stats = [ 0 ]
1964 os.path.walk(ctx.cvsroot, visit_file, (cd, p, stats))
1965 if ctx.verbose:
1966 print 'processed', stats[0], 'files'
1969 def pass2(ctx):
1970 "Pass 2: clean up the revision information."
1972 # We may have recorded some changes in revisions' timestamp. We need to
1973 # scan for any other files which may have had the same log message and
1974 # occurred at "the same time" and change their timestamps, too.
1976 # read the resync data file
1977 resync = read_resync(ctx.log_fname_base + RESYNC_SUFFIX)
1979 output = open(ctx.log_fname_base + CLEAN_REVS_SUFFIX, 'w')
1981 # process the revisions file, looking for items to clean up
1982 for line in fileinput.FileInput(ctx.log_fname_base + REVS_SUFFIX):
1983 timestamp, digest, op, rev, fname, branch_name, tags, branches = \
1984 parse_revs_line(line)
1985 if not resync.has_key(digest):
1986 output.write(line)
1987 continue
1989 # we have a hit. see if this is "near" any of the resync records we
1990 # have recorded for this digest [of the log message].
1991 for record in resync[digest]:
1992 if record[0] <= timestamp <= record[1]:
1993 # bingo! remap the time on this (record[2] is the new time).
1994 write_revs_line(output, record[2], digest, op, rev, fname,
1995 branch_name, tags, branches)
1997 print 'RESYNC: %s (%s) : old time="%s" new time="%s"' \
1998 % (relative_name(ctx.cvsroot, fname),
1999 rev, time.ctime(timestamp), time.ctime(record[2]))
2001 # adjust the time range. we want the COMMIT_THRESHOLD from the
2002 # bounds of the earlier/latest commit in this group.
2003 record[0] = min(record[0], timestamp - COMMIT_THRESHOLD/2)
2004 record[1] = max(record[1], timestamp + COMMIT_THRESHOLD/2)
2006 # stop looking for hits
2007 break
2008 else:
2009 # the file/rev did not need to have its time changed.
2010 output.write(line)
2013 def pass3(ctx):
2014 # sort the log files
2015 os.system('sort %s > %s' % (ctx.log_fname_base + CLEAN_REVS_SUFFIX,
2016 ctx.log_fname_base + SORTED_REVS_SUFFIX))
2019 def pass4(ctx):
2020 sym_tracker = SymbolicNameTracker()
2022 # A dictionary of Commit objects, keyed by digest. Each object
2023 # represents one logical commit, which may involve multiple files.
2025 # The reason this is a dictionary, not a single object, is that
2026 # there may be multiple commits interleaved in time. A commit can
2027 # span up to COMMIT_THRESHOLD seconds, which leaves plenty of time
2028 # for parts of some other commit to occur. Since the s-revs file is
2029 # sorted by timestamp first, then by digest within each timestamp,
2030 # it's quite easy to have interleaved commits.
2031 commits = { }
2033 # The total number of separate commits processed. This is used only for
2034 # printing statistics, it does not affect the results in the repository.
2035 count = 0
2037 # Start the dumpfile object.
2038 dumper = Dumper(ctx.dumpfile)
2040 # process the logfiles, creating the target
2041 for line in fileinput.FileInput(ctx.log_fname_base + SORTED_REVS_SUFFIX):
2042 timestamp, id, op, rev, fname, branch_name, tags, branches = \
2043 parse_revs_line(line)
2045 if ctx.trunk_only and not trunk_rev.match(rev):
2046 ### note this could/should have caused a flush, but the next item
2047 ### will take care of that for us
2048 continue
2050 # Each time we read a new line, we scan the commits we've
2051 # accumulated so far to see if any are ready for processing now.
2052 process = [ ]
2053 for scan_id, scan_c in commits.items():
2055 # ### ISSUE: the has_file() check below is not optimal.
2056 # It does fix the dataloss bug where revisions would get lost
2057 # if checked in too quickly, but it can also break apart the
2058 # commits. The correct fix would require tracking the dependencies
2059 # between change sets and committing them in proper order.
2060 if scan_c.t_max + COMMIT_THRESHOLD < timestamp or \
2061 scan_c.has_file(fname):
2062 process.append((scan_c.t_max, scan_c))
2063 del commits[scan_id]
2065 # If there are any elements in 'process' at this point, they need
2066 # to be committed, because this latest rev couldn't possibly be
2067 # part of any of them. Sort them into time-order, then commit 'em.
2068 process.sort()
2069 for t_max, c in process:
2070 c.commit(dumper, ctx, sym_tracker)
2071 count = count + len(process)
2073 # Add this item into the set of still-available commits.
2074 if commits.has_key(id):
2075 c = commits[id]
2076 else:
2077 c = commits[id] = Commit()
2078 c.add(timestamp, op, fname, rev, branch_name, tags, branches)
2080 # End of the sorted revs file. Flush any remaining commits:
2081 if commits:
2082 process = [ ]
2083 for id, c in commits.items():
2084 process.append((c.t_max, c))
2085 process.sort()
2086 for t_max, c in process:
2087 c.commit(dumper, ctx, sym_tracker)
2088 count = count + len(process)
2090 # Create (or complete) any branches and tags not already done.
2091 sym_tracker.finish(dumper, ctx)
2093 dumper.close()
2095 if ctx.verbose:
2096 print count, 'commits processed.'
2099 def pass5(ctx):
2100 # on a dry or dump-only run, there is nothing really to do in pass 5
2101 if ctx.dry_run or ctx.dump_only:
2102 return
2104 # create the target repository is so requested
2105 if ctx.create_repos:
2106 os.system('%s create %s' % (ctx.svnadmin, ctx.target))
2108 # now, load the dumpfile into the repository
2109 print 'loading %s into %s' % (ctx.dumpfile, ctx.target)
2110 os.system('%s load %s < %s'
2111 % (ctx.svnadmin, ctx.target, ctx.dumpfile))
2114 _passes = [
2115 pass1,
2116 pass2,
2117 pass3,
2118 pass4,
2119 pass5,
2123 class _ctx:
2124 pass
2127 def convert(ctx, start_pass=1):
2128 "Convert a CVS repository to an SVN repository."
2130 times = [ None ] * len(_passes)
2131 for i in range(start_pass - 1, len(_passes)):
2132 times[i] = time.time()
2133 if verbose:
2134 print '----- pass %d -----' % (i + 1)
2135 _passes[i](ctx)
2136 times.append(time.time())
2138 if verbose:
2139 for i in range(start_pass, len(_passes)+1):
2140 print 'pass %d: %d seconds' % (i, int(times[i] - times[i-1]))
2141 print ' total:', int(times[len(_passes)] - times[start_pass-1]), 'seconds'
2144 def usage(ctx):
2145 print 'USAGE: %s [-n] [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
2146 % os.path.basename(sys.argv[0])
2147 print ' -n dry run; parse CVS repos, but do not construct SVN repos'
2148 print ' -v verbose'
2149 print ' -s PATH path for SVN repos'
2150 print ' -p NUM start at pass NUM of %d' % len(_passes)
2151 print ' --create create a new SVN repository'
2152 print ' --dumpfile=PATH name of intermediate svn dumpfile'
2153 print ' --svnadmin=PATH path to the svnadmin program'
2154 print ' --trunk-only convert only trunk commits, not tags nor branches'
2155 print ' --trunk=PATH path for trunk (default: %s)' \
2156 % ctx.trunk_base
2157 print ' --branches=PATH path for branches (default: %s)' \
2158 % ctx.branches_base
2159 print ' --tags=PATH path for tags (default: %s)' \
2160 % ctx.tags_base
2161 print ' --no-prune don\'t prune empty directories'
2162 print ' --dump-only just produce a dumpfile, don\'t commit to a repos'
2163 print ' --encoding=ENC encoding of log messages in CVS repos (default: %s)' % ctx.encoding
2164 sys.exit(1)
2167 def main():
2168 # prepare the operation context
2169 ctx = _ctx()
2170 ctx.cvsroot = None
2171 ctx.target = None
2172 ctx.log_fname_base = DATAFILE
2173 ctx.dumpfile = DUMPFILE
2174 ctx.verbose = 0
2175 ctx.dry_run = 0
2176 ctx.prune = 1
2177 ctx.create_repos = 0
2178 ctx.dump_only = 0
2179 ctx.trunk_only = 0
2180 ctx.trunk_base = "trunk"
2181 ctx.tags_base = "tags"
2182 ctx.branches_base = "branches"
2183 ctx.encoding = "ascii"
2184 ctx.svnadmin = "svnadmin"
2186 try:
2187 opts, args = getopt.getopt(sys.argv[1:], 'p:s:vn',
2188 [ "create", "trunk=",
2189 "branches=", "tags=", "encoding=",
2190 "trunk-only", "no-prune", "dump-only"])
2191 except getopt.GetoptError:
2192 usage(ctx)
2193 if len(args) != 1:
2194 usage(ctx)
2196 ctx.cvsroot = args[0]
2197 start_pass = 1
2199 for opt, value in opts:
2200 if opt == '-p':
2201 start_pass = int(value)
2202 if start_pass < 1 or start_pass > len(_passes):
2203 print 'ERROR: illegal value (%d) for starting pass. ' \
2204 'must be 1 through %d.' % (start_pass, len(_passes))
2205 sys.exit(1)
2206 elif opt == '-v':
2207 ctx.verbose = 1
2208 elif opt == '-n':
2209 ctx.dry_run = 1
2210 elif opt == '-s':
2211 ctx.target = value
2212 elif opt == '--create':
2213 ctx.create_repos = 1
2214 elif opt == '--dumpfile':
2215 ctx.dumpfile = value
2216 elif opt == '--svnadmin':
2217 ctx.svnadmin = value
2218 elif opt == '--trunk-only':
2219 ctx.trunk_only = 1
2220 elif opt == '--trunk':
2221 ctx.trunk_base = value
2222 elif opt == '--branches':
2223 ctx.branches_base = value
2224 elif opt == '--tags':
2225 ctx.tags_base = value
2226 elif opt == '--no-prune':
2227 ctx.prune = None
2228 elif opt == '--dump-only':
2229 ctx.dump_only = 1
2230 elif opt == '--encoding':
2231 ctx.encoding = value
2233 # Consistency check for options.
2234 if (not ctx.target) and (not ctx.dump_only):
2235 sys.stderr.write("Error: must pass one of '-s' or '--dump-only'.\n")
2236 sys.exit(1)
2238 if ctx.target and ctx.dump_only:
2239 sys.stderr.write("Error: cannot pass both '-s' and '--dump-only'.\n")
2240 sys.exit(1)
2242 if ctx.create_repos and ctx.dump_only:
2243 sys.stderr.write("Error: cannot pass both '--create' and '--dump-only'.\n")
2244 sys.exit(1)
2246 if ((string.find(ctx.trunk_base, '/') > -1)
2247 or (string.find(ctx.tags_base, '/') > -1)
2248 or (string.find(ctx.branches_base, '/') > -1)):
2249 sys.stderr.write("Error: cannot pass multicomponent path to ")
2250 sys.stderr.write("--trunk, --tags, or --branches yet.\n")
2251 sys.stderr.write(" See http://subversion.tigris.org/issues/show_bug.cgi?")
2252 sys.stderr.write("id=1409 ")
2253 sys.stderr.write("for details.\n")
2254 sys.exit(1)
2256 convert(ctx, start_pass=start_pass)
2259 if __name__ == '__main__':
2260 main()