Release 0.11.0.
[bzr-fastimport.git] / exporter.py
blob0a6db67cd2a2294ce292f823627076825ef38f18
1 # -*- coding: utf-8 -*-
3 # Copyright (C) 2008 Canonical Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 # Original Copyright (c) 2008 Adeodato Simó
20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
22 # vim: fileencoding=utf-8
24 """Core engine for the fast-export command."""
26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
27 # is not updated (because the parent of commit is already merged, so we don't
28 # set new_git_branch to the previously used name)
30 from email.Utils import parseaddr
31 import sys, time
33 import bzrlib.branch
34 import bzrlib.revision
35 from bzrlib import (
36 builtins,
37 errors as bazErrors,
38 osutils,
39 progress,
40 trace,
43 from bzrlib.plugins.fastimport import (
44 helpers,
45 marks_file,
48 from fastimport import commands
49 from fastimport.helpers import (
50 binary_stream,
51 single_plural,
def _get_output_stream(destination):
    """Open the stream that the fast-export data is written to.

    :param destination: None or '-' to write to standard output;
        otherwise the name of the file to create. A name ending in
        '.gz' is written through gzip compression.
    :return: a writable binary file-like object
    """
    if destination is None or destination == '-':
        return binary_stream(sys.stdout)
    # Match the '.gz' extension rather than any trailing 'gz', so a plain
    # file whose name merely ends in those two letters is not compressed.
    if destination.endswith('.gz'):
        import gzip
        return gzip.open(destination, 'wb')
    return open(destination, 'wb')
64 # from dulwich.repo:
def check_ref_format(refname):
    """Check if a refname is correctly formatted.

    Implements the same rules as git-check-ref-format[1].

    [1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html

    :param refname: The refname to check
    :return: True if refname is valid, False otherwise
    """
    # These could be combined into one big expression, but are listed
    # separately to parallel [1].
    if '/.' in refname or refname.startswith('.'):
        return False
    if '/' not in refname:
        return False
    if '..' in refname:
        return False
    for c in refname:
        # 0x20 (space) is the first non-control ASCII character. The hex
        # literal is used because the old-style octal 040 is not valid
        # syntax on Python 3.
        if ord(c) < 0x20 or c in '\177 ~^:?*[':
            return False
    if refname[-1] in '/.':
        return False
    # A refname may not begin with a slash or contain empty components.
    if refname.startswith('/') or '//' in refname:
        return False
    # No slash-separated component may end with '.lock'.
    if refname.endswith('.lock') or '.lock/' in refname:
        return False
    if '@{' in refname:
        return False
    if '\\' in refname:
        return False
    return True
98 class BzrFastExporter(object):
def __init__(self, source, destination, git_branch=None, checkpoint=-1,
             import_marks_file=None, export_marks_file=None, revision=None,
             verbose=False, plain_format=False):
    """Export branch data in fast import format.

    :param source: location handed to Branch.open_containing() by run()
    :param destination: handed to _get_output_stream() to open the
        output stream
    :param git_branch: branch name used to build the refs/heads/ ref
        that commits are emitted on
    :param checkpoint: when greater than zero, the marks are saved and
        a checkpoint command is emitted each time the commit count is
        a multiple of this value
    :param import_marks_file: if set, marks are loaded from this file
    :param export_marks_file: if set, marks are saved to this file
    :param revision: if set, the revision range limiting the export
    :param verbose: if True, progress is reported every 100 commits
        instead of every 1000 and implicit renames are noted
    :param plain_format: if True, 'classic' fast-import format is
        used without any extended features; if False, the generated
        data is richer and includes information like multiple
        authors, revision properties, etc.
    """
    self.source = source
    self.outf = _get_output_stream(destination)
    self.git_branch = git_branch
    self.checkpoint = checkpoint
    self.import_marks_file = import_marks_file
    self.export_marks_file = export_marks_file
    self.revision = revision
    self.excluded_revisions = set()
    self.plain_format = plain_format
    self._multi_author_api_available = hasattr(
        bzrlib.revision.Revision, 'get_apparent_authors')
    self.properties_to_exclude = ['authors', 'author']

    # Progress reporting stuff
    self.verbose = verbose
    self.progress_every = 100 if verbose else 1000
    self._start_time = time.time()
    self._commit_total = 0

    # Load the marks and initialise things accordingly. The marks file
    # maps mark -> revision id; the exporter needs the inverse lookup.
    # Branch names are no longer included in the marks file.
    self.revid_to_mark = {}
    self.branch_names = {}
    if self.import_marks_file:
        marks_info = marks_file.import_marks(self.import_marks_file)
        if marks_info is not None:
            for mark, revid in marks_info.items():
                self.revid_to_mark[revid] = mark
def interesting_history(self):
    """Return the list of revision ids to consider for export.

    The ids come from the branch's merge-sorted history, limited to
    self.revision when that is set, in the reverse of the order the
    branch yields them. When a starting revision is given,
    self.excluded_revisions is filled with the history of that
    starting point so those revisions are not emitted later.
    """
    start_rev_id = end_rev_id = None
    if self.revision:
        first, last = builtins._get_revision_range(
            self.revision, self.branch, "fast-export")
        start_rev_id = first.rev_id
        end_rev_id = last.rev_id

    self.note("Calculating the revisions to include ...")
    history = []
    for rev_id, _, _, _ in self.branch.iter_merge_sorted_revisions(
            end_rev_id, start_rev_id):
        history.append(rev_id)
    history.reverse()

    # If a starting point was given, we need to later check that we don't
    # start emitting revisions from before that point. Collect the
    # revisions to exclude now ...
    if start_rev_id is not None:
        self.note("Calculating the revisions to exclude ...")
        excluded = set()
        for rev_id, _, _, _ in self.branch.iter_merge_sorted_revisions(
                start_rev_id):
            excluded.add(rev_id)
        self.excluded_revisions = excluded
    return history
def run(self):
    """Export the branch found at self.source to the output stream.

    Emits the feature commands (unless in plain format), one commit per
    interesting revision and finally the tags, all under a repository
    read lock. Afterwards the marks are saved and statistics are noted.
    """
    # Open the source
    self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]
    repository = self.branch.repository

    # Export the data
    repository.lock_read()
    try:
        revids = self.interesting_history()
        self._commit_total = len(revids)
        self.note("Starting export of %d revisions ..." %
                  self._commit_total)
        if not self.plain_format:
            self.emit_features()
        for revid in revids:
            self.emit_commit(revid, self.git_branch)
        if self.branch.supports_tags():
            self.emit_tags()
    finally:
        repository.unlock()

    # Save the marks if requested
    self._save_marks()
    self.dump_stats()
def note(self, msg, *args):
    """Output a note, prefixed with the current time of day."""
    trace.note("%s %s" % (self._time_of_day(), msg), *args)
def warning(self, msg, *args):
    """Output a warning, prefixed with the current time of day."""
    trace.warning("%s WARNING: %s" % (self._time_of_day(), msg), *args)
def _time_of_day(self):
    """Return the current local time formatted as HH:MM:SS."""
    # Note: this is a separate method so tests can patch in a fixed value
    now = time.localtime()
    return time.strftime("%H:%M:%S", now)
def report_progress(self, commit_count, details=''):
    """Note the export progress every self.progress_every commits.

    Nothing is reported when commit_count is zero or is not a multiple
    of self.progress_every.

    :param commit_count: the number of commits exported so far
    :param details: optional text appended to the progress note
    """
    if not commit_count or commit_count % self.progress_every != 0:
        return
    if self._commit_total:
        counts = "%d/%d" % (commit_count, self._commit_total)
    else:
        counts = "%d" % (commit_count,)
    minutes = (time.time() - self._start_time) / 60
    if minutes > 0:
        rate = commit_count * 1.0 / minutes
        if rate > 10:
            rate_str = "at %.0f/minute " % rate
        else:
            rate_str = "at %.1f/minute " % rate
    else:
        # No measurable time has elapsed (coarse or adjusted clock), so a
        # rate cannot be computed; dividing would raise ZeroDivisionError
        # or yield a meaningless negative rate.
        rate_str = ""
    self.note("%s commits exported %s%s" % (counts, rate_str, details))
def dump_stats(self):
    """Note the number of entries in the mark table and the elapsed time."""
    elapsed = time.time() - self._start_time
    exported = len(self.revid_to_mark)
    noun = single_plural(exported, "revision", "revisions")
    self.note("Exported %d %s in %s",
              exported, noun, progress.str_tdelta(elapsed))
def print_cmd(self, cmd):
    """Write the repr() of cmd, followed by a newline, to the output."""
    line = "%r\n" % cmd
    self.outf.write(line)
def _save_marks(self):
    """Write the mark -> revision id table to the export marks file.

    Does nothing when no export marks file was requested.
    """
    if not self.export_marks_file:
        return
    # The exporter tracks revision id -> mark; the marks file wants the
    # inverse mapping.
    revision_ids = {}
    for revid, mark in self.revid_to_mark.items():
        revision_ids[mark] = revid
    marks_file.export_marks(self.export_marks_file, revision_ids)
def is_empty_dir(self, tree, path):
    """Return True if path is a directory in tree that has no contents.

    A path without a file id produces a warning and counts as not empty.

    :param tree: the tree to look the path up in
    :param path: the path to check
    """
    path_id = tree.path2id(path)
    if path_id is None:
        self.warning("Skipping empty_dir detection - no file_id for %s" %
                     (path,))
        return False

    # Only directories can be empty directories
    if tree.kind(path_id) != 'directory':
        return False

    # Use treewalk to find the contents of our directory: the first item
    # walked is used, and its second element is the list of entries.
    first_walked = list(tree.walkdirs(prefix=path))[0]
    return len(first_walked[1]) == 0
def emit_features(self):
    """Emit a feature command for every known feature name, sorted."""
    names = list(commands.FEATURE_NAMES)
    names.sort()
    for name in names:
        self.print_cmd(commands.FeatureCommand(name))
def emit_commit(self, revid, git_branch):
    """Emit the commit command for one revision.

    Revisions that already have a mark or that are excluded are skipped.
    A revision that cannot be found (a ghost) is recorded with mark -1
    and nothing is emitted for it.

    :param revid: the id of the revision to export
    :param git_branch: the branch name the commit is recorded on; a
        temporary name replaces it for a parentless commit that is not
        the first one output
    """
    already_marked = revid in self.revid_to_mark
    if already_marked or revid in self.excluded_revisions:
        return

    # Get the Revision object
    try:
        revobj = self.branch.repository.get_revision(revid)
    except bazErrors.NoSuchRevision:
        # This is a ghost revision. Mark it as not found and next!
        self.revid_to_mark[revid] = -1
        return

    # Get the primary parent
    # TODO: Consider the excluded revisions when deciding the parents.
    # Currently, a commit with parents that are excluded ought to be
    # triggering the git_branch calculation below (and it is not).
    # IGC 20090824
    ncommits = len(self.revid_to_mark)
    if len(revobj.parent_ids) > 0:
        parent = revobj.parent_ids[0]
    else:
        if ncommits:
            # This is a parentless commit but it's not the first one
            # output. We need to create a new temporary branch for it
            # otherwise git-fast-import will assume the previous commit
            # was this one's parent
            git_branch = self._next_tmp_branch_name()
        parent = bzrlib.revision.NULL_REVISION

    # Print the commit
    mark = ncommits + 1
    self.revid_to_mark[revid] = mark
    git_ref = 'refs/heads/%s' % (git_branch,)
    file_cmds = self._get_filecommands(parent, revid)
    commit_cmd = self._get_commit_command(git_ref, mark, revobj, file_cmds)
    self.print_cmd(commit_cmd)

    # Report progress and checkpoint if it's time for that
    self.report_progress(ncommits)
    checkpoint_due = (self.checkpoint > 0 and ncommits
                      and ncommits % self.checkpoint == 0)
    if checkpoint_due:
        self.note("Exported %i commits - adding checkpoint to output"
                  % ncommits)
        self._save_marks()
        self.print_cmd(commands.CheckpointCommand())
def _get_name_email(self, user):
    """Split a user id into a (name, email) pair of UTF-8 encoded strings.

    :param user: the committer or author string to split
    """
    if '<' in user:
        name, email = parseaddr(user)
    else:
        # If the email isn't inside <>, we need to use it as the name
        # in order for things to round-trip correctly.
        # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
        name, email = user, ''
    return name.encode("utf-8"), email.encode("utf-8")
def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
    """Build the commit command for a revision.

    :param git_ref: the ref the commit is recorded on
    :param mark: the mark assigned to this commit
    :param revobj: the revision being exported; it is only read, never
        modified
    :param file_cmds: the file commands describing the commit's changes
    :return: a commands.CommitCommand
    """
    # Get the committer and author info
    committer = revobj.committer
    name, email = self._get_name_email(committer)
    committer_info = (name, email, revobj.timestamp, revobj.timezone)
    if self._multi_author_api_available:
        # Slice a copy rather than pop() so the list handed back by the
        # revision is left untouched.
        authors = list(revobj.get_apparent_authors())
        author = authors[0]
        more_authors = authors[1:]
    else:
        more_authors = []
        author = revobj.get_apparent_author()
    if not self.plain_format and more_authors:
        name, email = self._get_name_email(author)
        author_info = (name, email, revobj.timestamp, revobj.timezone)
        more_author_info = []
        for a in more_authors:
            name, email = self._get_name_email(a)
            more_author_info.append(
                (name, email, revobj.timestamp, revobj.timezone))
    elif author != committer:
        name, email = self._get_name_email(author)
        author_info = (name, email, revobj.timestamp, revobj.timezone)
        more_author_info = None
    else:
        author_info = None
        more_author_info = None

    # Get the parents in terms of marks
    non_ghost_parents = []
    for p in revobj.parent_ids:
        if p in self.excluded_revisions:
            continue
        try:
            parent_mark = self.revid_to_mark[p]
            non_ghost_parents.append(":%s" % parent_mark)
        except KeyError:
            # ghost - ignore
            continue
    if non_ghost_parents:
        from_ = non_ghost_parents[0]
        merges = non_ghost_parents[1:]
    else:
        from_ = None
        merges = None

    # Filter the revision properties. Some metadata (like the
    # author information) is already exposed in other ways so
    # don't repeat it here.
    if self.plain_format:
        properties = None
    else:
        # Filter a copy: deleting from revobj.properties itself would
        # strip the author properties from the revision object.
        properties = dict(revobj.properties)
        for prop in self.properties_to_exclude:
            properties.pop(prop, None)

    # Build and return the result
    return commands.CommitCommand(git_ref, mark, author_info,
        committer_info, revobj.message.encode("utf-8"), from_, merges,
        iter(file_cmds), more_authors=more_author_info,
        properties=properties)
def _get_revision_trees(self, parent, revision_id):
    """Return the (old, new) trees to diff for a revision.

    If the parent's inventory is malformed, the tree of the parent's own
    first parent is used as the old tree. If the revision's inventory is
    malformed, the new tree is None.

    :param parent: the revision id to use as the basis
    :param revision_id: the revision id being exported
    """
    repo = self.branch.repository
    try:
        tree_old = repo.revision_tree(parent)
    except bazErrors.UnexpectedInventoryFormat:
        self.warning("Parent is malformed - diffing against previous parent")
        # We can't find the old parent. Let's diff against his parent
        grandparent = repo.get_revision(parent).parent_ids[0]
        tree_old = repo.revision_tree(grandparent)
    try:
        tree_new = repo.revision_tree(revision_id)
    except bazErrors.UnexpectedInventoryFormat:
        # We can't really do anything anymore
        self.warning("Revision %s is malformed - skipping" % revision_id)
        tree_new = None
    return tree_old, tree_new
def _get_filecommands(self, parent, revision_id):
    """Get the list of FileCommands for the changes between two revisions.

    :param parent: the revision id the changes are relative to
    :param revision_id: the revision id being exported
    :return: a list of file commands; empty if either tree is unavailable
    """
    tree_old, tree_new = self._get_revision_trees(parent, revision_id)
    if not tree_old or not tree_new:
        # Something is wrong with this revision - ignore the filecommands
        return []

    changes = tree_new.changes_from(tree_old)

    # Make "modified" have 3-tuples, as added does
    my_modified = [entry[0:3] for entry in changes.modified]

    # The potential interaction between renames and deletes is messy.
    # Handle it here ...
    file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
        changes.renamed, changes.removed, revision_id, tree_old)

    # Map kind changes to a modification of the new kind. No delete is
    # emitted first (IGC: it seems harmful - if you add one, please file
    # a bug explaining why you needed to).
    for path, id_, kind1, kind2 in changes.kind_changed:
        path = self._adjust_path_for_renames(path, renamed, revision_id)
        my_modified.append((path, id_, kind2))

    # Record modifications
    for path, id_, kind in changes.added + my_modified + rd_modifies:
        if kind == 'file':
            text = tree_new.get_file_text(id_)
            mode = helpers.kind_to_mode('file', tree_new.is_executable(id_))
            file_cmds.append(commands.FileModifyCommand(
                path.encode("utf-8"), mode, None, text))
        elif kind == 'symlink':
            mode = helpers.kind_to_mode('symlink', False)
            target = tree_new.get_symlink_target(id_)
            file_cmds.append(commands.FileModifyCommand(
                path.encode("utf-8"), mode, None, target))
        elif kind == 'directory':
            # Directories are only representable in the extended format
            if not self.plain_format:
                mode = helpers.kind_to_mode('directory', False)
                file_cmds.append(commands.FileModifyCommand(
                    path.encode("utf-8"), mode, None, None))
        else:
            self.warning("cannot export '%s' of kind %s yet - ignoring" %
                         (path, kind))
    return file_cmds
def _process_renames_and_deletes(self, renames, deletes,
        revision_id, tree_old):
    """Turn the renames and deletes of a revision into ordered commands.

    :param renames: iterable of (oldpath, newpath, id, kind,
        text_modified, meta_modified) tuples
    :param deletes: iterable of (path, id, kind) tuples
    :param revision_id: the revision being exported; only used in notes
    :param tree_old: the tree the changes are relative to
    :return: a (file_cmds, modifies, renamed) tuple where file_cmds is
        the list of delete and rename commands in the order they must be
        emitted, modifies is a list of (newpath, id, kind) for renamed
        entries whose text or metadata also changed, and renamed is a
        list of [oldpath, newpath] pairs
    """
    file_cmds = []
    modifies = []
    renamed = []

    # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
    # In a nutshell, there are several nasty cases:
    #
    # 1) bzr rm a; bzr mv b a; bzr commit
    # 2) bzr mv x/y z; bzr rm x; commit
    #
    # The first must come out with the delete first like this:
    #
    # D a
    # R b a
    #
    # The second case must come out with the rename first like this:
    #
    # R x/y z
    # D x
    #
    # So outputting all deletes first or all renames first won't work.
    # Instead, we need to make multiple passes over the various lists to
    # get the ordering right.

    must_be_renamed = {}
    old_to_new = {}
    deleted_paths = set([p for p, _, _ in deletes])
    for (oldpath, newpath, id_, kind,
            text_modified, meta_modified) in renames:
        # Directory commands are only emitted in the extended format
        emit = kind != 'directory' or not self.plain_format
        if newpath in deleted_paths:
            # Case 1 above: the rename target was deleted in the same
            # revision, so the delete has to precede the rename.
            if emit:
                file_cmds.append(commands.FileDeleteCommand(newpath.encode("utf-8")))
            # Already handled; keep it out of the final delete pass
            deleted_paths.remove(newpath)
        if (self.is_empty_dir(tree_old, oldpath)):
            self.note("Skipping empty dir %s in rev %s" % (oldpath,
                revision_id))
            continue
        #oldpath = self._adjust_path_for_renames(oldpath, renamed,
        #    revision_id)
        renamed.append([oldpath, newpath])
        old_to_new[oldpath] = newpath
        if emit:
            file_cmds.append(
                commands.FileRenameCommand(oldpath.encode("utf-8"), newpath.encode("utf-8")))
        if text_modified or meta_modified:
            modifies.append((newpath, id_, kind))

        # Renaming a directory implies all children must be renamed.
        # Note: changes_from() doesn't handle this
        if kind == 'directory' and tree_old.kind(id_) == 'directory':
            for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                if e.kind == 'directory' and self.plain_format:
                    continue
                old_child_path = osutils.pathjoin(oldpath, p)
                new_child_path = osutils.pathjoin(newpath, p)
                must_be_renamed[old_child_path] = new_child_path

    # Add children not already renamed
    if must_be_renamed:
        renamed_already = set(old_to_new.keys())
        still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
        # Sorted so the implicit renames come out in a stable order
        for old_child_path in sorted(still_to_be_renamed):
            new_child_path = must_be_renamed[old_child_path]
            if self.verbose:
                self.note("implicitly renaming %s => %s" % (old_child_path,
                    new_child_path))
            file_cmds.append(commands.FileRenameCommand(old_child_path.encode("utf-8"),
                new_child_path.encode("utf-8")))

    # Record remaining deletes
    for path, id_, kind in deletes:
        # Skip deletes already emitted ahead of a rename onto that path
        if path not in deleted_paths:
            continue
        if kind == 'directory' and self.plain_format:
            continue
        #path = self._adjust_path_for_renames(path, renamed, revision_id)
        file_cmds.append(commands.FileDeleteCommand(path.encode("utf-8")))
    return file_cmds, modifies, renamed
def _adjust_path_for_renames(self, path, renamed, revision_id):
    """Return path as it is named after applying earlier renames.

    :param path: the path to adjust
    :param renamed: sequence of (old, new) path pairs, applied in order
    :param revision_id: the revision being exported; only used in notes
    :return: the adjusted path
    """
    # If a previous rename is found, we should adjust the path
    for old, new in renamed:
        if path == old:
            self.note("Changing path %s given rename to %s in revision %s"
                      % (path, new, revision_id))
            path = new
        elif path.startswith(old + '/'):
            self.note(
                "Adjusting path %s given rename of %s to %s in revision %s"
                % (path, old, new, revision_id))
            # Swap only the leading directory: str.replace() would also
            # rewrite any later occurrence of "old/" inside the path.
            path = new + path[len(old):]
    return path
def emit_tags(self):
    """Emit a reset command for each tag whose revision has a mark.

    Tags pointing at a revision without a mark are skipped with a
    warning, as are (in plain format only) tags whose ref name does not
    pass check_ref_format().
    """
    for tag, revid in self.branch.tags.get_tag_dict().items():
        if revid not in self.revid_to_mark:
            self.warning('not creating tag %r pointing to non-existent '
                         'revision %s' % (tag, revid))
            continue
        mark = self.revid_to_mark[revid]
        git_ref = 'refs/tags/%s' % tag.encode("utf-8")
        if self.plain_format and not check_ref_format(git_ref):
            self.warning('not creating tag %r as its name would not be '
                         'valid in git.', git_ref)
            continue
        self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
def _next_tmp_branch_name(self):
    """Return a unique branch name. The name will start with "tmp".

    The first name handed out is "tmp"; later ones are "tmp.1", "tmp.2"
    and so on, counted in self.branch_names.
    """
    base = 'tmp'
    if base in self.branch_names:
        count = self.branch_names[base] + 1
        self.branch_names[base] = count
        return '%s.%d' % (base, count)
    self.branch_names[base] = 0
    return base