exporter.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (C) 2008 Canonical Ltd
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18 #
  19 # Original Copyright (c) 2008 Adeodato Simó
  20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
  21 #
  22 # vim: fileencoding=utf-8
  23
  24 """Core engine for the fast-export command."""
  25
  26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
  27 # is not updated (because the parent of commit is already merged, so we don't
  28 # set new_git_branch to the previously used name)
  29
  30 from email.Utils import parseaddr
  31 import sys, time
  32
  33 import bzrlib.branch
  34 import bzrlib.revision
  35 from bzrlib import (
  36     builtins,
  37     errors as bazErrors,
  38     osutils,
  39     progress,
  40     trace,
  41     )
  42
  43 from bzrlib.plugins.fastimport import (
  44     helpers,
  45     marks_file,
  46     )
  47
  48 from fastimport import commands
  49 from fastimport.helpers import (
  50     binary_stream,
  51     single_plural,
  52     )
  53
  54
  55 def _get_output_stream(destination):
  56     if destination is None or destination == '-':
  57         return binary_stream(sys.stdout)
  58     elif destination.endswith('gz'):
  59         import gzip
  60         return gzip.open(destination, 'wb')
  61     else:
  62         return open(destination, 'wb')
  63
  64
  65 class BzrFastExporter(object):
  66
  67     def __init__(self, source, destination, git_branch=None, checkpoint=-1,
  68         import_marks_file=None, export_marks_file=None, revision=None,
  69         verbose=False, plain_format=False):
  70         """Export branch data in fast import format.
  71
  72         :param plain_format: if True, 'classic' fast-import format is
  73           used without any extended features; if False, the generated
  74           data is richer and includes information like multiple
  75           authors, revision properties, etc.
  76         """
  77         self.source = source
  78         self.outf = _get_output_stream(destination)
  79         self.git_branch = git_branch
  80         self.checkpoint = checkpoint
  81         self.import_marks_file = import_marks_file
  82         self.export_marks_file = export_marks_file
  83         self.revision = revision
  84         self.excluded_revisions = set()
  85         self.plain_format = plain_format
  86         self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
  87             'get_apparent_authors')
  88         self.properties_to_exclude = ['authors', 'author']
  89
  90         # Progress reporting stuff
  91         self.verbose = verbose
  92         if verbose:
  93             self.progress_every = 100
  94         else:
  95             self.progress_every = 1000
  96         self._start_time = time.time()
  97         self._commit_total = 0
  98
  99         # Load the marks and initialise things accordingly
 100         self.revid_to_mark = {}
 101         self.branch_names = {}
 102         if self.import_marks_file:
 103             marks_info = marks_file.import_marks(self.import_marks_file)
 104             if marks_info is not None:
 105                 self.revid_to_mark = dict((r, m) for m, r in
 106                     marks_info.items())
 107                 # These are no longer included in the marks file
 108                 #self.branch_names = marks_info[1]
 109
 110     def interesting_history(self):
 111         if self.revision:
 112             rev1, rev2 = builtins._get_revision_range(self.revision,
 113                 self.branch, "fast-export")
 114             start_rev_id = rev1.rev_id
 115             end_rev_id = rev2.rev_id
 116         else:
 117             start_rev_id = None
 118             end_rev_id = None
 119         self.note("Calculating the revisions to include ...")
 120         view_revisions = reversed([rev_id for rev_id, _, _, _ in
 121             self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)])
 122         # If a starting point was given, we need to later check that we don't
 123         # start emitting revisions from before that point. Collect the
 124         # revisions to exclude now ...
 125         if start_rev_id is not None:
 126             self.note("Calculating the revisions to exclude ...")
 127             self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
 128                 self.branch.iter_merge_sorted_revisions(start_rev_id)])
 129         return list(view_revisions)
 130
 131     def run(self):
 132         # Open the source
 133         self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]
 134
 135         # Export the data
 136         self.branch.repository.lock_read()
 137         try:
 138             interesting = self.interesting_history()
 139             self._commit_total = len(interesting)
 140             self.note("Starting export of %d revisions ..." %
 141                 self._commit_total)
 142             if not self.plain_format:
 143                 self.emit_features()
 144             for revid in interesting:
 145                 self.emit_commit(revid, self.git_branch)
 146             if self.branch.supports_tags():
 147                 self.emit_tags()
 148         finally:
 149             self.branch.repository.unlock()
 150
 151         # Save the marks if requested
 152         self._save_marks()
 153         self.dump_stats()
 154
 155     def note(self, msg, *args):
 156         """Output a note but timestamp it."""
 157         msg = "%s %s" % (self._time_of_day(), msg)
 158         trace.note(msg, *args)
 159
 160     def warning(self, msg, *args):
 161         """Output a warning but timestamp it."""
 162         msg = "%s WARNING: %s" % (self._time_of_day(), msg)
 163         trace.warning(msg, *args)
 164
 165     def _time_of_day(self):
 166         """Time of day as a string."""
 167         # Note: this is a separate method so tests can patch in a fixed value
 168         return time.strftime("%H:%M:%S")
 169
 170     def report_progress(self, commit_count, details=''):
 171         if commit_count and commit_count % self.progress_every == 0:
 172             if self._commit_total:
 173                 counts = "%d/%d" % (commit_count, self._commit_total)
 174             else:
 175                 counts = "%d" % (commit_count,)
 176             minutes = (time.time() - self._start_time) / 60
 177             rate = commit_count * 1.0 / minutes
 178             if rate > 10:
 179                 rate_str = "at %.0f/minute " % rate
 180             else:
 181                 rate_str = "at %.1f/minute " % rate
 182             self.note("%s commits exported %s%s" % (counts, rate_str, details))
 183
 184     def dump_stats(self):
 185         time_required = progress.str_tdelta(time.time() - self._start_time)
 186         rc = len(self.revid_to_mark)
 187         self.note("Exported %d %s in %s",
 188             rc, single_plural(rc, "revision", "revisions"),
 189             time_required)
 190
 191     def print_cmd(self, cmd):
 192         self.outf.write("%r\n" % cmd)
 193
 194     def _save_marks(self):
 195         if self.export_marks_file:
 196             revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
 197             marks_file.export_marks(self.export_marks_file, revision_ids)
 198
 199     def is_empty_dir(self, tree, path):
 200         path_id = tree.path2id(path)
 201         if path_id is None:
 202             self.warning("Skipping empty_dir detection - no file_id for %s" %
 203                 (path,))
 204             return False
 205
 206         # Continue if path is not a directory
 207         if tree.kind(path_id) != 'directory':
 208             return False
 209
 210         # Use treewalk to find the contents of our directory
 211         contents = list(tree.walkdirs(prefix=path))[0]
 212         if len(contents[1]) == 0:
 213             return True
 214         else:
 215             return False
 216
 217     def emit_features(self):
 218         for feature in sorted(commands.FEATURE_NAMES):
 219             self.print_cmd(commands.FeatureCommand(feature))
 220
 221     def emit_commit(self, revid, git_branch):
 222         if revid in self.revid_to_mark or revid in self.excluded_revisions:
 223             return
 224
 225         # Get the Revision object
 226         try:
 227             revobj = self.branch.repository.get_revision(revid)
 228         except bazErrors.NoSuchRevision:
 229             # This is a ghost revision. Mark it as not found and next!
 230             self.revid_to_mark[revid] = -1
 231             return
 232
 233         # Get the primary parent
 234         # TODO: Consider the excluded revisions when deciding the parents.
 235         # Currently, a commit with parents that are excluded ought to be
 236         # triggering the git_branch calculation below (and it is not).
 237         # IGC 20090824
 238         ncommits = len(self.revid_to_mark)
 239         nparents = len(revobj.parent_ids)
 240         if nparents == 0:
 241             if ncommits:
 242                 # This is a parentless commit but it's not the first one
 243                 # output. We need to create a new temporary branch for it
 244                 # otherwise git-fast-import will assume the previous commit
 245                 # was this one's parent
 246                 git_branch = self._next_tmp_branch_name()
 247             parent = bzrlib.revision.NULL_REVISION
 248         else:
 249             parent = revobj.parent_ids[0]
 250
 251         # Print the commit
 252         git_ref = 'refs/heads/%s' % (git_branch,)
 253         mark = ncommits + 1
 254         self.revid_to_mark[revid] = mark
 255         file_cmds = self._get_filecommands(parent, revid)
 256         self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
 257             file_cmds))
 258
 259         # Report progress and checkpoint if it's time for that
 260         self.report_progress(ncommits)
 261         if (self.checkpoint > 0 and ncommits
 262             and ncommits % self.checkpoint == 0):
 263             self.note("Exported %i commits - adding checkpoint to output"
 264                 % ncommits)
 265             self._save_marks()
 266             self.print_cmd(commands.CheckpointCommand())
 267
 268     def _get_name_email(self, user):
 269         if user.find('<') == -1:
 270             # If the email isn't inside <>, we need to use it as the name
 271             # in order for things to round-trip correctly.
 272             # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
 273             name = user
 274             email = ''
 275         else:
 276             name, email = parseaddr(user)
 277         return name, email
 278
 279     def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
 280         # Get the committer and author info
 281         committer = revobj.committer
 282         name, email = self._get_name_email(committer)
 283         committer_info = (name, email, revobj.timestamp, revobj.timezone)
 284         if self._multi_author_api_available:
 285             more_authors = revobj.get_apparent_authors()
 286             author = more_authors.pop(0)
 287         else:
 288             more_authors = []
 289             author = revobj.get_apparent_author()
 290         if not self.plain_format and more_authors:
 291             name, email = self._get_name_email(author)
 292             author_info = (name, email, revobj.timestamp, revobj.timezone)
 293             more_author_info = []
 294             for a in more_authors:
 295                 name, email = self._get_name_email(a)
 296                 more_author_info.append(
 297                     (name, email, revobj.timestamp, revobj.timezone))
 298         elif author != committer:
 299             name, email = self._get_name_email(author)
 300             author_info = (name, email, revobj.timestamp, revobj.timezone)
 301             more_author_info = None
 302         else:
 303             author_info = None
 304             more_author_info = None
 305
 306         # Get the parents in terms of marks
 307         non_ghost_parents = []
 308         for p in revobj.parent_ids:
 309             if p in self.excluded_revisions:
 310                 continue
 311             try:
 312                 parent_mark = self.revid_to_mark[p]
 313                 non_ghost_parents.append(":%s" % parent_mark)
 314             except KeyError:
 315                 # ghost - ignore
 316                 continue
 317         if non_ghost_parents:
 318             from_ = non_ghost_parents[0]
 319             merges = non_ghost_parents[1:]
 320         else:
 321             from_ = None
 322             merges = None
 323
 324         # Filter the revision properties. Some metadata (like the
 325         # author information) is already exposed in other ways so
 326         # don't repeat it here.
 327         if self.plain_format:
 328             properties = None
 329         else:
 330             properties = revobj.properties
 331             for prop in self.properties_to_exclude:
 332                 try:
 333                     del properties[prop]
 334                 except KeyError:
 335                     pass
 336
 337         # Build and return the result
 338         return commands.CommitCommand(git_ref, mark, author_info,
 339             committer_info, revobj.message, from_, merges, iter(file_cmds),
 340             more_authors=more_author_info, properties=properties)
 341
 342     def _get_revision_trees(self, parent, revision_id):
 343         try:
 344             tree_old = self.branch.repository.revision_tree(parent)
 345         except bazErrors.UnexpectedInventoryFormat:
 346             self.warning("Parent is malformed - diffing against previous parent")
 347             # We can't find the old parent. Let's diff against his parent
 348             pp = self.branch.repository.get_revision(parent)
 349             tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
 350         tree_new = None
 351         try:
 352             tree_new = self.branch.repository.revision_tree(revision_id)
 353         except bazErrors.UnexpectedInventoryFormat:
 354             # We can't really do anything anymore
 355             self.warning("Revision %s is malformed - skipping" % revision_id)
 356         return tree_old, tree_new
 357
 358     def _get_filecommands(self, parent, revision_id):
 359         """Get the list of FileCommands for the changes between two revisions."""
 360         tree_old, tree_new = self._get_revision_trees(parent, revision_id)
 361         if not(tree_old and tree_new):
 362             # Something is wrong with this revision - ignore the filecommands
 363             return []
 364
 365         changes = tree_new.changes_from(tree_old)
 366
 367         # Make "modified" have 3-tuples, as added does
 368         my_modified = [ x[0:3] for x in changes.modified ]
 369
 370         # The potential interaction between renames and deletes is messy.
 371         # Handle it here ...
 372         file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
 373             changes.renamed, changes.removed, revision_id, tree_old)
 374
 375         # Map kind changes to a delete followed by an add
 376         for path, id_, kind1, kind2 in changes.kind_changed:
 377             path = self._adjust_path_for_renames(path, renamed, revision_id)
 378             # IGC: I don't understand why a delete is needed here.
 379             # In fact, it seems harmful? If you uncomment this line,
 380             # please file a bug explaining why you needed to.
 381             #file_cmds.append(commands.FileDeleteCommand(path))
 382             my_modified.append((path, id_, kind2))
 383
 384         # Record modifications
 385         for path, id_, kind in changes.added + my_modified + rd_modifies:
 386             if kind == 'file':
 387                 text = tree_new.get_file_text(id_)
 388                 file_cmds.append(commands.FileModifyCommand(path,
 389                     helpers.kind_to_mode('file', tree_new.is_executable(id_)),
 390                     None, text))
 391             elif kind == 'symlink':
 392                 file_cmds.append(commands.FileModifyCommand(path,
 393                     helpers.kind_to_mode('symlink', False),
 394                     None, tree_new.get_symlink_target(id_)))
 395             elif kind == 'directory':
 396                 if not self.plain_format:
 397                     file_cmds.append(commands.FileModifyCommand(path,
 398                         helpers.kind_to_mode('directory', False),
 399                         None, None))
 400             else:
 401                 self.warning("cannot export '%s' of kind %s yet - ignoring" %
 402                     (path, kind))
 403         return file_cmds
 404
    def _process_renames_and_deletes(self, renames, deletes,
        revision_id, tree_old):
        """Turn the renames and deletes of a revision into file commands.

        :param renames: an iterable of (oldpath, newpath, id, kind,
          text_modified, meta_modified) tuples.
        :param deletes: a sequence of (path, id, kind) tuples; it is
          iterated more than once.
        :param revision_id: the id of the revision being exported; only
          used in log messages.
        :param tree_old: the tree the paths are renamed/deleted from.
        :return: a (file_cmds, modifies, renamed) tuple where file_cmds
          is the ordered list of delete/rename commands, modifies is a
          list of (newpath, id, kind) tuples for renamed entries whose
          text or metadata also changed, and renamed is a list of
          [oldpath, newpath] pairs for the renames that were processed.
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        # old child path -> new child path, for children of renamed dirs
        must_be_renamed = {}
        # old path -> new path, for the renames handled explicitly below
        old_to_new = {}
        # Deleted paths that have not been emitted as a delete yet
        deleted_paths = set([p for p, _, _ in deletes])
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            # In plain format no commands are emitted for directories
            emit = kind != 'directory' or not self.plain_format
            if newpath in deleted_paths:
                # Case 1 above: the rename target was deleted in this
                # revision, so the delete must precede the rename.
                if emit:
                    file_cmds.append(commands.FileDeleteCommand(newpath))
                deleted_paths.remove(newpath)
            if (self.is_empty_dir(tree_old, oldpath)):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                    revision_id))
                continue
            #oldpath = self._adjust_path_for_renames(oldpath, renamed,
            #    revision_id)
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            if emit:
                file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
            if text_modified or meta_modified:
                # The caller still has to emit the new content/metadata
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                    if e.kind == 'directory' and self.plain_format:
                        continue
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            renamed_already = set(old_to_new.keys())
            still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
            # Sorted so that the implicit renames come out in a stable order
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (old_child_path,
                        new_child_path))
                file_cmds.append(commands.FileRenameCommand(old_child_path,
                    new_child_path))

        # Record remaining deletes
        for path, id_, kind in deletes:
            if path not in deleted_paths:
                # Already emitted ahead of a rename onto this path
                continue
            if kind == 'directory' and self.plain_format:
                continue
            #path = self._adjust_path_for_renames(path, renamed, revision_id)
            file_cmds.append(commands.FileDeleteCommand(path))
        return file_cmds, modifies, renamed
 485
 486     def _adjust_path_for_renames(self, path, renamed, revision_id):
 487         # If a previous rename is found, we should adjust the path
 488         for old, new in renamed:
 489             if path == old:
 490                 self.note("Changing path %s given rename to %s in revision %s"
 491                     % (path, new, revision_id))
 492                 path = new
 493             elif path.startswith(old + '/'):
 494                 self.note(
 495                     "Adjusting path %s given rename of %s to %s in revision %s"
 496                     % (path, old, new, revision_id))
 497                 path = path.replace(old + "/", new + "/")
 498         return path
 499
 500     def emit_tags(self):
 501         for tag, revid in self.branch.tags.get_tag_dict().items():
 502             try:
 503                 mark = self.revid_to_mark[revid]
 504             except KeyError:
 505                 self.warning('not creating tag %r pointing to non-existent '
 506                     'revision %s' % (tag, revid))
 507             else:
 508                 git_ref = 'refs/tags/%s' % tag.encode("utf-8")
 509                 self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
 510
 511     def _next_tmp_branch_name(self):
 512         """Return a unique branch name. The name will start with "tmp"."""
 513         prefix = 'tmp'
 514         if prefix not in self.branch_names:
 515             self.branch_names[prefix] = 0
 516         else:
 517             self.branch_names[prefix] += 1
 518             prefix = '%s.%d' % (prefix, self.branch_names[prefix])
 519         return prefix