bzr_exporter.py

   1 # -*- coding: utf-8 -*-
   2
   3 # Copyright (C) 2008 Canonical Ltd
   4 #
   5 # This program is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License as published by
   7 # the Free Software Foundation; either version 2 of the License, or
   8 # (at your option) any later version.
   9 #
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18 #
  19 # Original Copyright (c) 2008 Adeodato Simó
  20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
  21 #
  22 # vim: fileencoding=utf-8
  23
  24 """Core engine for the fast-export command."""
  25
  26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
  27 # is not updated (because the parent of commit is already merged, so we don't
  28 # set new_git_branch to the previously used name)
  29
  30 from email.Utils import parseaddr
  31 import sys, time
  32
  33 import bzrlib.branch
  34 import bzrlib.revision
  35 from bzrlib import (
  36     builtins,
  37     errors as bazErrors,
  38     osutils,
  39     progress,
  40     trace,
  41     )
  42
  43 from bzrlib.plugins.fastimport import helpers, marks_file
  44
  45 from fastimport import commands
  46
  47 class BzrFastExporter(object):
  48
  49     def __init__(self, source, destination, git_branch=None, checkpoint=-1,
  50         import_marks_file=None, export_marks_file=None, revision=None,
  51         verbose=False, plain_format=False):
  52         """Export branch data in fast import format.
  53
  54         :param plain_format: if True, 'classic' fast-import format is
  55           used without any extended features; if False, the generated
  56           data is richer and includes information like multiple
  57           authors, revision properties, etc.
  58         """
  59         self.source = source
  60         if destination is None or destination == '-':
  61             self.outf = helpers.binary_stream(sys.stdout)
  62         elif destination.endswith('gz'):
  63             import gzip
  64             self.outf = gzip.open(destination, 'wb')
  65         else:
  66             self.outf = open(destination, 'wb')
  67         self.git_branch = git_branch
  68         self.checkpoint = checkpoint
  69         self.import_marks_file = import_marks_file
  70         self.export_marks_file = export_marks_file
  71         self.revision = revision
  72         self.excluded_revisions = set()
  73         self.plain_format = plain_format
  74         self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
  75             'get_apparent_authors')
  76         self.properties_to_exclude = ['authors', 'author']
  77
  78         # Progress reporting stuff
  79         self.verbose = verbose
  80         if verbose:
  81             self.progress_every = 100
  82         else:
  83             self.progress_every = 1000
  84         self._start_time = time.time()
  85         self._commit_total = 0
  86
  87         # Load the marks and initialise things accordingly
  88         self.revid_to_mark = {}
  89         self.branch_names = {}
  90         if self.import_marks_file:
  91             marks_info = marks_file.import_marks(self.import_marks_file)
  92             if marks_info is not None:
  93                 self.revid_to_mark = dict((r, m) for m, r in
  94                     marks_info[0].items())
  95                 self.branch_names = marks_info[1]
  96
  97     def interesting_history(self):
  98         if self.revision:
  99             rev1, rev2 = builtins._get_revision_range(self.revision,
 100                 self.branch, "fast-export")
 101             start_rev_id = rev1.rev_id
 102             end_rev_id = rev2.rev_id
 103         else:
 104             start_rev_id = None
 105             end_rev_id = None
 106         self.note("Calculating the revisions to include ...")
 107         view_revisions = reversed([rev_id for rev_id, _, _, _ in
 108             self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)])
 109         # If a starting point was given, we need to later check that we don't
 110         # start emitting revisions from before that point. Collect the
 111         # revisions to exclude now ...
 112         if start_rev_id is not None:
 113             self.note("Calculating the revisions to exclude ...")
 114             self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
 115                 self.branch.iter_merge_sorted_revisions(start_rev_id)])
 116         return list(view_revisions)
 117
 118     def run(self):
 119         # Open the source
 120         self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]
 121
 122         # Export the data
 123         self.branch.repository.lock_read()
 124         try:
 125             interesting = self.interesting_history()
 126             self._commit_total = len(interesting)
 127             self.note("Starting export of %d revisions ..." %
 128                 self._commit_total)
 129             if not self.plain_format:
 130                 self.emit_features()
 131             for revid in interesting:
 132                 self.emit_commit(revid, self.git_branch)
 133             if self.branch.supports_tags():
 134                 self.emit_tags()
 135         finally:
 136             self.branch.repository.unlock()
 137
 138         # Save the marks if requested
 139         self._save_marks()
 140         self.dump_stats()
 141
 142     def note(self, msg, *args):
 143         """Output a note but timestamp it."""
 144         msg = "%s %s" % (self._time_of_day(), msg)
 145         trace.note(msg, *args)
 146
 147     def warning(self, msg, *args):
 148         """Output a warning but timestamp it."""
 149         msg = "%s WARNING: %s" % (self._time_of_day(), msg)
 150         trace.warning(msg, *args)
 151
 152     def _time_of_day(self):
 153         """Time of day as a string."""
 154         # Note: this is a separate method so tests can patch in a fixed value
 155         return time.strftime("%H:%M:%S")
 156
 157     def report_progress(self, commit_count, details=''):
 158         if commit_count and commit_count % self.progress_every == 0:
 159             if self._commit_total:
 160                 counts = "%d/%d" % (commit_count, self._commit_total)
 161             else:
 162                 counts = "%d" % (commit_count,)
 163             minutes = (time.time() - self._start_time) / 60
 164             rate = commit_count * 1.0 / minutes
 165             if rate > 10:
 166                 rate_str = "at %.0f/minute " % rate
 167             else:
 168                 rate_str = "at %.1f/minute " % rate
 169             self.note("%s commits exported %s%s" % (counts, rate_str, details))
 170
 171     def dump_stats(self):
 172         time_required = progress.str_tdelta(time.time() - self._start_time)
 173         rc = len(self.revid_to_mark)
 174         self.note("Exported %d %s in %s",
 175             rc, helpers.single_plural(rc, "revision", "revisions"),
 176             time_required)
 177
 178     def print_cmd(self, cmd):
 179         self.outf.write("%r\n" % cmd)
 180
 181     def _save_marks(self):
 182         if self.export_marks_file:
 183             revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
 184             marks_file.export_marks(self.export_marks_file, revision_ids,
 185                 self.branch_names)
 186
 187     def is_empty_dir(self, tree, path):
 188         path_id = tree.path2id(path)
 189         if path_id is None:
 190             self.warning("Skipping empty_dir detection - no file_id for %s" %
 191                 (path,))
 192             return False
 193
 194         # Continue if path is not a directory
 195         if tree.kind(path_id) != 'directory':
 196             return False
 197
 198         # Use treewalk to find the contents of our directory
 199         contents = list(tree.walkdirs(prefix=path))[0]
 200         if len(contents[1]) == 0:
 201             return True
 202         else:
 203             return False
 204
 205     def emit_features(self):
 206         for feature in sorted(commands.FEATURE_NAMES):
 207             self.print_cmd(commands.FeatureCommand(feature))
 208
 209     def emit_commit(self, revid, git_branch):
 210         if revid in self.revid_to_mark or revid in self.excluded_revisions:
 211             return
 212
 213         # Get the Revision object
 214         try:
 215             revobj = self.branch.repository.get_revision(revid)
 216         except bazErrors.NoSuchRevision:
 217             # This is a ghost revision. Mark it as not found and next!
 218             self.revid_to_mark[revid] = -1
 219             return
 220
 221         # Get the primary parent
 222         # TODO: Consider the excluded revisions when deciding the parents.
 223         # Currently, a commit with parents that are excluded ought to be
 224         # triggering the git_branch calculation below (and it is not).
 225         # IGC 20090824
 226         ncommits = len(self.revid_to_mark)
 227         nparents = len(revobj.parent_ids)
 228         if nparents == 0:
 229             if ncommits:
 230                 # This is a parentless commit but it's not the first one
 231                 # output. We need to create a new temporary branch for it
 232                 # otherwise git-fast-import will assume the previous commit
 233                 # was this one's parent
 234                 git_branch = self._next_tmp_branch_name()
 235             parent = bzrlib.revision.NULL_REVISION
 236         else:
 237             parent = revobj.parent_ids[0]
 238
 239         # Print the commit
 240         git_ref = 'refs/heads/%s' % (git_branch,)
 241         mark = ncommits + 1
 242         self.revid_to_mark[revid] = mark
 243         file_cmds = self._get_filecommands(parent, revid)
 244         self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
 245             file_cmds))
 246
 247         # Report progress and checkpoint if it's time for that
 248         self.report_progress(ncommits)
 249         if (self.checkpoint > 0 and ncommits
 250             and ncommits % self.checkpoint == 0):
 251             self.note("Exported %i commits - adding checkpoint to output"
 252                 % ncommits)
 253             self._save_marks()
 254             self.print_cmd(commands.CheckpointCommand())
 255
 256     def _get_name_email(self, user):
 257         if user.find('<') == -1:
 258             # If the email isn't inside <>, we need to use it as the name
 259             # in order for things to round-trip correctly.
 260             # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
 261             name = user
 262             email = ''
 263         else:
 264             name, email = parseaddr(user)
 265         return name, email
 266
 267     def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
 268         # Get the committer and author info
 269         committer = revobj.committer
 270         name, email = self._get_name_email(committer)
 271         committer_info = (name, email, revobj.timestamp, revobj.timezone)
 272         if self._multi_author_api_available:
 273             more_authors = revobj.get_apparent_authors()
 274             author = more_authors.pop(0)
 275         else:
 276             more_authors = []
 277             author = revobj.get_apparent_author()
 278         if more_authors:
 279             name, email = self._get_name_email(author)
 280             author_info = (name, email, revobj.timestamp, revobj.timezone)
 281             more_author_info = []
 282             for a in more_authors:
 283                 name, email = self._get_name_email(a)
 284                 more_author_info.append(
 285                     (name, email, revobj.timestamp, revobj.timezone))
 286         elif author != committer:
 287             name, email = self._get_name_email(author)
 288             author_info = (name, email, revobj.timestamp, revobj.timezone)
 289             more_author_info = None
 290         else:
 291             author_info = None
 292             more_author_info = None
 293
 294         # Get the parents in terms of marks
 295         non_ghost_parents = []
 296         for p in revobj.parent_ids:
 297             if p in self.excluded_revisions:
 298                 continue
 299             try:
 300                 parent_mark = self.revid_to_mark[p]
 301                 non_ghost_parents.append(":%s" % parent_mark)
 302             except KeyError:
 303                 # ghost - ignore
 304                 continue
 305         if non_ghost_parents:
 306             from_ = non_ghost_parents[0]
 307             merges = non_ghost_parents[1:]
 308         else:
 309             from_ = None
 310             merges = None
 311
 312         # Filter the revision properties. Some metadata (like the
 313         # author information) is already exposed in other ways so
 314         # don't repeat it here.
 315         if self.plain_format:
 316             properties = None
 317         else:
 318             properties = revobj.properties
 319             for prop in self.properties_to_exclude:
 320                 try:
 321                     del properties[prop]
 322                 except KeyError:
 323                     pass
 324
 325         # Build and return the result
 326         return commands.CommitCommand(git_ref, mark, author_info,
 327             committer_info, revobj.message, from_, merges, iter(file_cmds),
 328             more_authors=more_author_info, properties=properties)
 329
 330     def _get_revision_trees(self, parent, revision_id):
 331         try:
 332             tree_old = self.branch.repository.revision_tree(parent)
 333         except bazErrors.UnexpectedInventoryFormat:
 334             self.warning("Parent is malformed - diffing against previous parent")
 335             # We can't find the old parent. Let's diff against his parent
 336             pp = self.branch.repository.get_revision(parent)
 337             tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
 338         tree_new = None
 339         try:
 340             tree_new = self.branch.repository.revision_tree(revision_id)
 341         except bazErrors.UnexpectedInventoryFormat:
 342             # We can't really do anything anymore
 343             self.warning("Revision %s is malformed - skipping" % revision_id)
 344         return tree_old, tree_new
 345
 346     def _get_filecommands(self, parent, revision_id):
 347         """Get the list of FileCommands for the changes between two revisions."""
 348         tree_old, tree_new = self._get_revision_trees(parent, revision_id)
 349         if not(tree_old and tree_new):
 350             # Something is wrong with this revision - ignore the filecommands
 351             return []
 352
 353         changes = tree_new.changes_from(tree_old)
 354
 355         # Make "modified" have 3-tuples, as added does
 356         my_modified = [ x[0:3] for x in changes.modified ]
 357
 358         # The potential interaction between renames and deletes is messy.
 359         # Handle it here ...
 360         file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
 361             changes.renamed, changes.removed, revision_id, tree_old)
 362
 363         # Map kind changes to a delete followed by an add
 364         for path, id_, kind1, kind2 in changes.kind_changed:
 365             path = self._adjust_path_for_renames(path, renamed, revision_id)
 366             # IGC: I don't understand why a delete is needed here.
 367             # In fact, it seems harmful? If you uncomment this line,
 368             # please file a bug explaining why you needed to.
 369             #file_cmds.append(commands.FileDeleteCommand(path))
 370             my_modified.append((path, id_, kind2))
 371
 372         # Record modifications
 373         for path, id_, kind in changes.added + my_modified + rd_modifies:
 374             if kind == 'file':
 375                 text = tree_new.get_file_text(id_)
 376                 file_cmds.append(commands.FileModifyCommand(path,
 377                     helpers.kind_to_mode('file', tree_new.is_executable(id_)),
 378                     None, text))
 379             elif kind == 'symlink':
 380                 file_cmds.append(commands.FileModifyCommand(path,
 381                     helpers.kind_to_mode('symlink', False),
 382                     None, tree_new.get_symlink_target(id_)))
 383             elif kind == 'directory':
 384                 if not self.plain_format:
 385                     file_cmds.append(commands.FileModifyCommand(path,
 386                         helpers.kind_to_mode('directory', False),
 387                         None, None))
 388             else:
 389                 self.warning("cannot export '%s' of kind %s yet - ignoring" %
 390                     (path, kind))
 391         return file_cmds
 392
 393     def _process_renames_and_deletes(self, renames, deletes,
 394         revision_id, tree_old):
 395         file_cmds = []
 396         modifies = []
 397         renamed = []
 398
 399         # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
 400         # In a nutshell, there are several nasty cases:
 401         #
 402         # 1) bzr rm a; bzr mv b a; bzr commit
 403         # 2) bzr mv x/y z; bzr rm x; commmit
 404         #
 405         # The first must come out with the delete first like this:
 406         #
 407         # D a
 408         # R b a
 409         #
 410         # The second case must come out with the rename first like this:
 411         #
 412         # R x/y z
 413         # D x
 414         #
 415         # So outputting all deletes first or all renames first won't work.
 416         # Instead, we need to make multiple passes over the various lists to
 417         # get the ordering right.
 418
 419         must_be_renamed = {}
 420         old_to_new = {}
 421         deleted_paths = set([p for p, _, _ in deletes])
 422         for (oldpath, newpath, id_, kind,
 423                 text_modified, meta_modified) in renames:
 424             emit = kind != 'directory' or not self.plain_format
 425             if newpath in deleted_paths:
 426                 if emit:
 427                     file_cmds.append(commands.FileDeleteCommand(newpath))
 428                 deleted_paths.remove(newpath)
 429             if (self.is_empty_dir(tree_old, oldpath)):
 430                 self.note("Skipping empty dir %s in rev %s" % (oldpath,
 431                     revision_id))
 432                 continue
 433             #oldpath = self._adjust_path_for_renames(oldpath, renamed,
 434             #    revision_id)
 435             renamed.append([oldpath, newpath])
 436             old_to_new[oldpath] = newpath
 437             if emit:
 438                 file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
 439             if text_modified or meta_modified:
 440                 modifies.append((newpath, id_, kind))
 441
 442             # Renaming a directory implies all children must be renamed.
 443             # Note: changes_from() doesn't handle this
 444             if kind == 'directory':
 445                 for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
 446                     if e.kind == 'directory' and self.plain_format:
 447                         continue
 448                     old_child_path = osutils.pathjoin(oldpath, p)
 449                     new_child_path = osutils.pathjoin(newpath, p)
 450                     must_be_renamed[old_child_path] = new_child_path
 451
 452         # Add children not already renamed
 453         if must_be_renamed:
 454             renamed_already = set(old_to_new.keys())
 455             still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
 456             for old_child_path in sorted(still_to_be_renamed):
 457                 new_child_path = must_be_renamed[old_child_path]
 458                 if self.verbose:
 459                     self.note("implicitly renaming %s => %s" % (old_child_path,
 460                         new_child_path))
 461                 file_cmds.append(commands.FileRenameCommand(old_child_path,
 462                     new_child_path))
 463
 464         # Record remaining deletes
 465         for path, id_, kind in deletes:
 466             if path not in deleted_paths:
 467                 continue
 468             if kind == 'directory' and self.plain_format:
 469                 continue
 470             #path = self._adjust_path_for_renames(path, renamed, revision_id)
 471             file_cmds.append(commands.FileDeleteCommand(path))
 472         return file_cmds, modifies, renamed
 473
 474     def _adjust_path_for_renames(self, path, renamed, revision_id):
 475         # If a previous rename is found, we should adjust the path
 476         for old, new in renamed:
 477             if path == old:
 478                 self.note("Changing path %s given rename to %s in revision %s"
 479                     % (path, new, revision_id))
 480                 path = new
 481             elif path.startswith(old + '/'):
 482                 self.note(
 483                     "Adjusting path %s given rename of %s to %s in revision %s"
 484                     % (path, old, new, revision_id))
 485                 path = path.replace(old + "/", new + "/")
 486         return path
 487
 488     def emit_tags(self):
 489         for tag, revid in self.branch.tags.get_tag_dict().items():
 490             try:
 491                 mark = self.revid_to_mark[revid]
 492             except KeyError:
 493                 self.warning('not creating tag %r pointing to non-existent '
 494                     'revision %s' % (tag, revid))
 495             else:
 496                 git_ref = 'refs/tags/%s' % tag
 497                 self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
 498
 499     def _next_tmp_branch_name(self):
 500         """Return a unique branch name. The name will start with "tmp"."""
 501         prefix = 'tmp'
 502         if prefix not in self.branch_names:
 503             self.branch_names[prefix] = 0
 504         else:
 505             self.branch_names[prefix] += 1
 506             prefix = '%s.%d' % (prefix, self.branch_names[prefix])
 507         return prefix