bzr_exporter.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (C) 2008 Canonical Ltd
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18 #
  19 # Original Copyright (c) 2008 Adeodato Simó
  20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
  21 #
  22 # vim: fileencoding=utf-8
  23
  24 """Core engine for the fast-export command."""
  25
  26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
  27 # is not updated (because the parent of commit is already merged, so we don't
  28 # set new_git_branch to the previously used name)
  29
  30 from email.Utils import parseaddr
  31 import sys, time
  32
  33 import bzrlib.branch
  34 import bzrlib.revision
  35 from bzrlib import (
  36     builtins,
  37     errors as bazErrors,
  38     osutils,
  39     progress,
  40     trace,
  41     )
  42
  43 from bzrlib.plugins.fastimport import commands, helpers, marks_file
  44
  45
  46 class BzrFastExporter(object):
  47
  48     def __init__(self, source, destination, git_branch=None, checkpoint=-1,
  49         import_marks_file=None, export_marks_file=None, revision=None,
  50         verbose=False, plain_format=False):
  51         """Export branch data in fast import format.
  52
  53         :param plain_format: if True, 'classic' fast-import format is
  54           used without any extended features; if False, the generated
  55           data is richer and includes information like multiple
  56           authors, revision properties, etc.
  57         """
  58         self.source = source
  59         if destination is None or destination == '-':
  60             self.outf = helpers.binary_stream(sys.stdout)
  61         elif destination.endswith('gz'):
  62             import gzip
  63             self.outf = gzip.open(destination, 'wb')
  64         else:
  65             self.outf = open(destination, 'wb')
  66         self.git_branch = git_branch
  67         self.checkpoint = checkpoint
  68         self.import_marks_file = import_marks_file
  69         self.export_marks_file = export_marks_file
  70         self.revision = revision
  71         self.excluded_revisions = set()
  72         self.plain_format = plain_format
  73         self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
  74             'get_apparent_authors')
  75         self.properties_to_exclude = ['authors', 'author']
  76
  77         # Progress reporting stuff
  78         self.verbose = verbose
  79         if verbose:
  80             self.progress_every = 100
  81         else:
  82             self.progress_every = 1000
  83         self._start_time = time.time()
  84         self._commit_total = 0
  85
  86         # Load the marks and initialise things accordingly
  87         self.revid_to_mark = {}
  88         self.branch_names = {}
  89         if self.import_marks_file:
  90             marks_info = marks_file.import_marks(self.import_marks_file)
  91             if marks_info is not None:
  92                 self.revid_to_mark = dict((r, m) for m, r in
  93                     marks_info[0].items())
  94                 self.branch_names = marks_info[1]
  95
  96     def interesting_history(self):
  97         if self.revision:
  98             rev1, rev2 = builtins._get_revision_range(self.revision,
  99                 self.branch, "fast-export")
 100             start_rev_id = rev1.rev_id
 101             end_rev_id = rev2.rev_id
 102         else:
 103             start_rev_id = None
 104             end_rev_id = None
 105         self.note("Calculating the revisions to include ...")
 106         view_revisions = reversed([rev_id for rev_id, _, _, _ in
 107             self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)])
 108         # If a starting point was given, we need to later check that we don't
 109         # start emitting revisions from before that point. Collect the
 110         # revisions to exclude now ...
 111         if start_rev_id is not None:
 112             self.note("Calculating the revisions to exclude ...")
 113             self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
 114                 self.branch.iter_merge_sorted_revisions(start_rev_id)])
 115         return list(view_revisions)
 116
 117     def run(self):
 118         # Open the source
 119         self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]
 120
 121         # Export the data
 122         self.branch.repository.lock_read()
 123         try:
 124             interesting = self.interesting_history()
 125             self._commit_total = len(interesting)
 126             self.note("Starting export of %d revisions ..." %
 127                 self._commit_total)
 128             if not self.plain_format:
 129                 self.emit_features()
 130             for revid in interesting:
 131                 self.emit_commit(revid, self.git_branch)
 132             if self.branch.supports_tags():
 133                 self.emit_tags()
 134         finally:
 135             self.branch.repository.unlock()
 136
 137         # Save the marks if requested
 138         self._save_marks()
 139         self.dump_stats()
 140
 141     def note(self, msg, *args):
 142         """Output a note but timestamp it."""
 143         msg = "%s %s" % (self._time_of_day(), msg)
 144         trace.note(msg, *args)
 145
 146     def warning(self, msg, *args):
 147         """Output a warning but timestamp it."""
 148         msg = "%s WARNING: %s" % (self._time_of_day(), msg)
 149         trace.warning(msg, *args)
 150
 151     def _time_of_day(self):
 152         """Time of day as a string."""
 153         # Note: this is a separate method so tests can patch in a fixed value
 154         return time.strftime("%H:%M:%S")
 155
 156     def report_progress(self, commit_count, details=''):
 157         if commit_count and commit_count % self.progress_every == 0:
 158             if self._commit_total:
 159                 counts = "%d/%d" % (commit_count, self._commit_total)
 160             else:
 161                 counts = "%d" % (commit_count,)
 162             minutes = (time.time() - self._start_time) / 60
 163             rate = commit_count * 1.0 / minutes
 164             if rate > 10:
 165                 rate_str = "at %.0f/minute " % rate
 166             else:
 167                 rate_str = "at %.1f/minute " % rate
 168             self.note("%s commits exported %s%s" % (counts, rate_str, details))
 169
 170     def dump_stats(self):
 171         time_required = progress.str_tdelta(time.time() - self._start_time)
 172         rc = len(self.revid_to_mark)
 173         self.note("Exported %d %s in %s",
 174             rc, helpers.single_plural(rc, "revision", "revisions"),
 175             time_required)
 176
 177     def print_cmd(self, cmd):
 178         self.outf.write("%r\n" % cmd)
 179
 180     def _save_marks(self):
 181         if self.export_marks_file:
 182             revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
 183             marks_file.export_marks(self.export_marks_file, revision_ids,
 184                 self.branch_names)
 185
 186     def is_empty_dir(self, tree, path):
 187         path_id = tree.path2id(path)
 188         if path_id is None:
 189             self.warning("Skipping empty_dir detection - no file_id for %s" %
 190                 (path,))
 191             return False
 192
 193         # Continue if path is not a directory
 194         if tree.kind(path_id) != 'directory':
 195             return False
 196
 197         # Use treewalk to find the contents of our directory
 198         contents = list(tree.walkdirs(prefix=path))[0]
 199         if len(contents[1]) == 0:
 200             return True
 201         else:
 202             return False
 203
 204     def emit_features(self):
 205         for feature in sorted(commands.FEATURE_NAMES):
 206             self.print_cmd(commands.FeatureCommand(feature))
 207
 208     def emit_commit(self, revid, git_branch):
 209         if revid in self.revid_to_mark or revid in self.excluded_revisions:
 210             return
 211
 212         # Get the Revision object
 213         try:
 214             revobj = self.branch.repository.get_revision(revid)
 215         except bazErrors.NoSuchRevision:
 216             # This is a ghost revision. Mark it as not found and next!
 217             self.revid_to_mark[revid] = -1
 218             return
 219
 220         # Get the primary parent
 221         # TODO: Consider the excluded revisions when deciding the parents.
 222         # Currently, a commit with parents that are excluded ought to be
 223         # triggering the git_branch calculation below (and it is not).
 224         # IGC 20090824
 225         ncommits = len(self.revid_to_mark)
 226         nparents = len(revobj.parent_ids)
 227         if nparents == 0:
 228             if ncommits:
 229                 # This is a parentless commit but it's not the first one
 230                 # output. We need to create a new temporary branch for it
 231                 # otherwise git-fast-import will assume the previous commit
 232                 # was this one's parent
 233                 git_branch = self._next_tmp_branch_name()
 234             parent = bzrlib.revision.NULL_REVISION
 235         else:
 236             parent = revobj.parent_ids[0]
 237
 238         # Print the commit
 239         git_ref = 'refs/heads/%s' % (git_branch,)
 240         mark = ncommits + 1
 241         self.revid_to_mark[revid] = mark
 242         file_cmds = self._get_filecommands(parent, revid)
 243         self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
 244             file_cmds))
 245
 246         # Report progress and checkpoint if it's time for that
 247         self.report_progress(ncommits)
 248         if (self.checkpoint > 0 and ncommits
 249             and ncommits % self.checkpoint == 0):
 250             self.note("Exported %i commits - adding checkpoint to output"
 251                 % ncommits)
 252             self._save_marks()
 253             self.print_cmd(commands.CheckpointCommand())
 254
 255     def _get_name_email(self, user):
 256         if user.find('<') == -1:
 257             # If the email isn't inside <>, we need to use it as the name
 258             # in order for things to round-trip correctly.
 259             # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
 260             name = user
 261             email = ''
 262         else:
 263             name, email = parseaddr(user)
 264         return name, email
 265
 266     def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
 267         # Get the committer and author info
 268         committer = revobj.committer
 269         name, email = self._get_name_email(committer)
 270         committer_info = (name, email, revobj.timestamp, revobj.timezone)
 271         if self._multi_author_api_available:
 272             more_authors = revobj.get_apparent_authors()
 273             author = more_authors.pop(0)
 274         else:
 275             more_authors = []
 276             author = revobj.get_apparent_author()
 277         if more_authors:
 278             name, email = self._get_name_email(author)
 279             author_info = (name, email, revobj.timestamp, revobj.timezone)
 280             more_author_info = []
 281             for a in more_authors:
 282                 name, email = self._get_name_email(a)
 283                 more_author_info.append(
 284                     (name, email, revobj.timestamp, revobj.timezone))
 285         elif author != committer:
 286             name, email = self._get_name_email(author)
 287             author_info = (name, email, revobj.timestamp, revobj.timezone)
 288             more_author_info = None
 289         else:
 290             author_info = None
 291             more_author_info = None
 292
 293         # Get the parents in terms of marks
 294         non_ghost_parents = []
 295         for p in revobj.parent_ids:
 296             if p in self.excluded_revisions:
 297                 continue
 298             try:
 299                 parent_mark = self.revid_to_mark[p]
 300                 non_ghost_parents.append(":%s" % parent_mark)
 301             except KeyError:
 302                 # ghost - ignore
 303                 continue
 304         if non_ghost_parents:
 305             from_ = non_ghost_parents[0]
 306             merges = non_ghost_parents[1:]
 307         else:
 308             from_ = None
 309             merges = None
 310
 311         # Filter the revision properties. Some metadata (like the
 312         # author information) is already exposed in other ways so
 313         # don't repeat it here.
 314         if self.plain_format:
 315             properties = None
 316         else:
 317             properties = revobj.properties
 318             for prop in self.properties_to_exclude:
 319                 try:
 320                     del properties[prop]
 321                 except KeyError:
 322                     pass
 323
 324         # Build and return the result
 325         return commands.CommitCommand(git_ref, mark, author_info,
 326             committer_info, revobj.message, from_, merges, iter(file_cmds),
 327             more_authors=more_author_info, properties=properties)
 328
 329     def _get_revision_trees(self, parent, revision_id):
 330         try:
 331             tree_old = self.branch.repository.revision_tree(parent)
 332         except bazErrors.UnexpectedInventoryFormat:
 333             self.warning("Parent is malformed - diffing against previous parent")
 334             # We can't find the old parent. Let's diff against his parent
 335             pp = self.branch.repository.get_revision(parent)
 336             tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
 337         tree_new = None
 338         try:
 339             tree_new = self.branch.repository.revision_tree(revision_id)
 340         except bazErrors.UnexpectedInventoryFormat:
 341             # We can't really do anything anymore
 342             self.warning("Revision %s is malformed - skipping" % revision_id)
 343         return tree_old, tree_new
 344
 345     def _get_filecommands(self, parent, revision_id):
 346         """Get the list of FileCommands for the changes between two revisions."""
 347         tree_old, tree_new = self._get_revision_trees(parent, revision_id)
 348         if not(tree_old and tree_new):
 349             # Something is wrong with this revision - ignore the filecommands
 350             return []
 351
 352         changes = tree_new.changes_from(tree_old)
 353
 354         # Make "modified" have 3-tuples, as added does
 355         my_modified = [ x[0:3] for x in changes.modified ]
 356
 357         # The potential interaction between renames and deletes is messy.
 358         # Handle it here ...
 359         file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
 360             changes.renamed, changes.removed, revision_id, tree_old)
 361
 362         # Map kind changes to a delete followed by an add
 363         for path, id_, kind1, kind2 in changes.kind_changed:
 364             path = self._adjust_path_for_renames(path, renamed, revision_id)
 365             # IGC: I don't understand why a delete is needed here.
 366             # In fact, it seems harmful? If you uncomment this line,
 367             # please file a bug explaining why you needed to.
 368             #file_cmds.append(commands.FileDeleteCommand(path))
 369             my_modified.append((path, id_, kind2))
 370
 371         # Record modifications
 372         for path, id_, kind in changes.added + my_modified + rd_modifies:
 373             if kind == 'file':
 374                 text = tree_new.get_file_text(id_)
 375                 file_cmds.append(commands.FileModifyCommand(path, 'file',
 376                     tree_new.is_executable(id_), None, text))
 377             elif kind == 'symlink':
 378                 file_cmds.append(commands.FileModifyCommand(path, 'symlink',
 379                     False, None, tree_new.get_symlink_target(id_)))
 380             elif kind == 'directory':
 381                 if not self.plain_format:
 382                     file_cmds.append(commands.FileModifyCommand(path, 'directory',
 383                         False, None, None))
 384             else:
 385                 self.warning("cannot export '%s' of kind %s yet - ignoring" %
 386                     (path, kind))
 387         return file_cmds
 388
    def _process_renames_and_deletes(self, renames, deletes,
        revision_id, tree_old):
        """Work out the delete and rename commands for one revision.

        :param renames: 6-tuples of (oldpath, newpath, id, kind,
          text_modified, meta_modified).
        :param deletes: 3-tuples of (path, id, kind).
        :param revision_id: id of the revision being exported; only used
          in notes.
        :param tree_old: the tree being diffed against; used to detect
          empty directories and to list the children of renamed
          directories.
        :return: a (file_cmds, modifies, renamed) tuple: the delete and
          rename commands in the order they must be output, the
          (newpath, id, kind) entries of renamed items whose text or
          metadata changed as well, and the [oldpath, newpath] pairs
          that were recorded as renamed.
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        must_be_renamed = {}
        old_to_new = {}
        deleted_paths = set([p for p, _, _ in deletes])
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            # Directory commands are only emitted outside plain format
            emit = kind != 'directory' or not self.plain_format
            if newpath in deleted_paths:
                # Case 1 above: the rename target was deleted, so the
                # delete has to come out before the rename.
                if emit:
                    file_cmds.append(commands.FileDeleteCommand(newpath))
                # Handled here, so the final pass must not delete it again
                deleted_paths.remove(newpath)
            if (self.is_empty_dir(tree_old, oldpath)):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                    revision_id))
                continue
            #oldpath = self._adjust_path_for_renames(oldpath, renamed,
            #    revision_id)
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            if emit:
                file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
            if text_modified or meta_modified:
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                    if e.kind == 'directory' and self.plain_format:
                        continue
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            renamed_already = set(old_to_new.keys())
            still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
            # Sorted, so the implicit renames come out in a stable order
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (old_child_path,
                        new_child_path))
                file_cmds.append(commands.FileRenameCommand(old_child_path,
                    new_child_path))

        # Record remaining deletes
        for path, id_, kind in deletes:
            # Skip the deletes already emitted ahead of a rename
            if path not in deleted_paths:
                continue
            if kind == 'directory' and self.plain_format:
                continue
            #path = self._adjust_path_for_renames(path, renamed, revision_id)
            file_cmds.append(commands.FileDeleteCommand(path))
        return file_cmds, modifies, renamed
 469
 470     def _adjust_path_for_renames(self, path, renamed, revision_id):
 471         # If a previous rename is found, we should adjust the path
 472         for old, new in renamed:
 473             if path == old:
 474                 self.note("Changing path %s given rename to %s in revision %s"
 475                     % (path, new, revision_id))
 476                 path = new
 477             elif path.startswith(old + '/'):
 478                 self.note(
 479                     "Adjusting path %s given rename of %s to %s in revision %s"
 480                     % (path, old, new, revision_id))
 481                 path = path.replace(old + "/", new + "/")
 482         return path
 483
 484     def emit_tags(self):
 485         for tag, revid in self.branch.tags.get_tag_dict().items():
 486             try:
 487                 mark = self.revid_to_mark[revid]
 488             except KeyError:
 489                 self.warning('not creating tag %r pointing to non-existent '
 490                     'revision %s' % (tag, revid))
 491             else:
 492                 git_ref = 'refs/tags/%s' % tag
 493                 self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
 494
 495     def _next_tmp_branch_name(self):
 496         """Return a unique branch name. The name will start with "tmp"."""
 497         prefix = 'tmp'
 498         if prefix not in self.branch_names:
 499             self.branch_names[prefix] = 0
 500         else:
 501             self.branch_names[prefix] += 1
 502             prefix = '%s.%d' % (prefix, self.branch_names[prefix])
 503         return prefix