Support get_file_text in _TreeShim.
[bzr-fastimport.git] / bzr_exporter.py
bloba1bd75b1b9da7451ab57ef8b870f0d90d2dbfba8
1 # -*- coding: utf-8 -*-
3 # Copyright (C) 2008 Canonical Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 # Original Copyright (c) 2008 Adeodato Simó
20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
22 # vim: fileencoding=utf-8
24 """Core engine for the fast-export command."""
26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
27 # is not updated (because the parent of commit is already merged, so we don't
28 # set new_git_branch to the previously used name)
30 from email.Utils import parseaddr
31 import sys, time
33 import bzrlib.branch
34 import bzrlib.revision
35 from bzrlib import (
36 builtins,
37 errors as bazErrors,
38 osutils,
39 progress,
40 trace,
43 from bzrlib.plugins.fastimport import helpers, marks_file
45 from fastimport import commands
class BzrFastExporter(object):
    """Write the history of a Bazaar branch as a fast-import stream.

    Typical use is to construct an instance and call run() once.  Commits
    are written in merge-sorted order; every exported revision is given a
    numeric mark which is remembered in ``revid_to_mark``.
    """

    # Mark recorded in revid_to_mark for ghost revisions (revisions the
    # repository cannot supply).  It must never be written to the stream.
    _GHOST_MARK = -1

    def __init__(self, source, destination, git_branch=None, checkpoint=-1,
                 import_marks_file=None, export_marks_file=None,
                 revision=None, verbose=False, plain_format=False):
        """Export branch data in fast import format.

        :param source: location of the branch to export; it is opened
          with Branch.open_containing() when run() is called.
        :param destination: where to write the stream. None or '-' means
          standard output, a name ending in 'gz' is written through gzip
          and anything else is opened as a plain binary file.
        :param git_branch: branch name used to build the
          'refs/heads/<name>' ref that commits are written to.
        :param checkpoint: if greater than zero, the marks are saved and
          a checkpoint command is emitted every ``checkpoint`` commits.
        :param import_marks_file: if set, marks and branch names are
          loaded from this file before exporting.
        :param export_marks_file: if set, marks and branch names are
          saved to this file.
        :param revision: optional revision range limiting what is
          exported; it is handed to builtins._get_revision_range().
        :param verbose: if True, progress is reported every 100 commits
          instead of every 1000 and implicit renames are noted.
        :param plain_format: if True, 'classic' fast-import format is
          used without any extended features; if False, the generated
          data is richer and includes information like multiple
          authors, revision properties, etc.
        """
        self.source = source
        # Only a stream opened here is closed by run(); stdout is not ours.
        self._owns_outf = False
        if destination is None or destination == '-':
            self.outf = helpers.binary_stream(sys.stdout)
        elif destination.endswith('gz'):
            import gzip
            self.outf = gzip.open(destination, 'wb')
            self._owns_outf = True
        else:
            self.outf = open(destination, 'wb')
            self._owns_outf = True
        self.git_branch = git_branch
        self.checkpoint = checkpoint
        self.import_marks_file = import_marks_file
        self.export_marks_file = export_marks_file
        self.revision = revision
        self.excluded_revisions = set()
        self.plain_format = plain_format
        self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
            'get_apparent_authors')
        self.properties_to_exclude = ['authors', 'author']

        # Progress reporting stuff
        self.verbose = verbose
        if verbose:
            self.progress_every = 100
        else:
            self.progress_every = 1000
        self._start_time = time.time()
        self._commit_total = 0

        # Load the marks and initialise things accordingly
        self.revid_to_mark = {}
        self.branch_names = {}
        if self.import_marks_file:
            marks_info = marks_file.import_marks(self.import_marks_file)
            if marks_info is not None:
                # The marks file maps mark -> revid; we need the inverse.
                self.revid_to_mark = dict((r, m) for m, r in
                    marks_info[0].items())
                self.branch_names = marks_info[1]

    def interesting_history(self):
        """Return the revision ids to export, in the order to emit them.

        As a side effect, ``excluded_revisions`` is filled in when the
        requested range has a starting revision.
        """
        if self.revision:
            rev1, rev2 = builtins._get_revision_range(self.revision,
                self.branch, "fast-export")
            start_rev_id = rev1.rev_id
            end_rev_id = rev2.rev_id
        else:
            start_rev_id = None
            end_rev_id = None
        self.note("Calculating the revisions to include ...")
        view_revisions = [rev_id for rev_id, _, _, _ in
            self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
        # The iterator is consumed from end_rev_id towards start_rev_id;
        # the export is emitted in the opposite order.
        view_revisions.reverse()
        # If a starting point was given, we need to later check that we don't
        # start emitting revisions from before that point. Collect the
        # revisions to exclude now ...
        if start_rev_id is not None:
            self.note("Calculating the revisions to exclude ...")
            self.excluded_revisions = set(rev_id for rev_id, _, _, _ in
                self.branch.iter_merge_sorted_revisions(start_rev_id))
        return view_revisions

    def run(self):
        """Open the source branch and export it.

        The output stream is closed afterwards if it was opened by this
        object, even when the export fails part way through.
        """
        try:
            # Open the source
            self.branch = bzrlib.branch.Branch.open_containing(
                self.source)[0]

            # Export the data
            self.branch.repository.lock_read()
            try:
                interesting = self.interesting_history()
                self._commit_total = len(interesting)
                self.note("Starting export of %d revisions ..." %
                    self._commit_total)
                if not self.plain_format:
                    self.emit_features()
                for revid in interesting:
                    self.emit_commit(revid, self.git_branch)
                if self.branch.supports_tags():
                    self.emit_tags()
            finally:
                self.branch.repository.unlock()

            # Save the marks if requested
            self._save_marks()
            self.dump_stats()
        finally:
            self._close_output()

    def _close_output(self):
        """Close the output stream if this exporter opened it."""
        # getattr() keeps this safe for instances built without __init__.
        if getattr(self, '_owns_outf', False):
            self._owns_outf = False
            self.outf.close()

    def note(self, msg, *args):
        """Output a note but timestamp it."""
        msg = "%s %s" % (self._time_of_day(), msg)
        trace.note(msg, *args)

    def warning(self, msg, *args):
        """Output a warning but timestamp it."""
        msg = "%s WARNING: %s" % (self._time_of_day(), msg)
        trace.warning(msg, *args)

    def _time_of_day(self):
        """Time of day as a string."""
        # Note: this is a separate method so tests can patch in a fixed value
        return time.strftime("%H:%M:%S")

    def report_progress(self, commit_count, details=''):
        """Note how far the export has got.

        Nothing is reported unless ``commit_count`` is a non-zero multiple
        of ``progress_every``.

        :param commit_count: number of commits exported so far.
        :param details: extra text appended to the note.
        """
        if not commit_count or commit_count % self.progress_every != 0:
            return
        if self._commit_total:
            counts = "%d/%d" % (commit_count, self._commit_total)
        else:
            counts = "%d" % (commit_count,)
        minutes = (time.time() - self._start_time) / 60
        if minutes > 0:
            rate = commit_count * 1.0 / minutes
            if rate > 10:
                rate_str = "at %.0f/minute " % rate
            else:
                rate_str = "at %.1f/minute " % rate
        else:
            # No measurable time has passed, so a rate cannot be computed.
            rate_str = ""
        self.note("%s commits exported %s%s" % (counts, rate_str, details))

    def dump_stats(self):
        """Note how many revisions have a mark and how long it took."""
        time_required = progress.str_tdelta(time.time() - self._start_time)
        rc = len(self.revid_to_mark)
        self.note("Exported %d %s in %s",
            rc, helpers.single_plural(rc, "revision", "revisions"),
            time_required)

    def print_cmd(self, cmd):
        """Write the repr() of a command, plus a newline, to the output."""
        # The 1-tuple keeps the formatting correct whatever type cmd is.
        self.outf.write("%r\n" % (cmd,))

    def _save_marks(self):
        """Save the marks and branch names if an export file was given."""
        if self.export_marks_file:
            # The marks file maps mark -> revid, the inverse of our dict.
            revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
            marks_file.export_marks(self.export_marks_file, revision_ids,
                self.branch_names)

    def is_empty_dir(self, tree, path):
        """Return True if ``path`` is a directory in ``tree`` with no entries.

        False is returned when the path has no file id (a warning is
        issued), when it is not a directory, or when the tree walk yields
        nothing for it.
        """
        path_id = tree.path2id(path)
        if path_id is None:
            self.warning("Skipping empty_dir detection - no file_id for %s" %
                (path,))
            return False

        # Continue if path is not a directory
        if tree.kind(path_id) != 'directory':
            return False

        # Use treewalk to find the contents of our directory. Only the
        # first block is examined, so stop walking as soon as we have it
        # rather than traversing the whole subtree.
        for contents in tree.walkdirs(prefix=path):
            return len(contents[1]) == 0
        return False

    def emit_features(self):
        """Emit a feature command for each known feature name, sorted."""
        for feature in sorted(commands.FEATURE_NAMES):
            self.print_cmd(commands.FeatureCommand(feature))

    def emit_commit(self, revid, git_branch):
        """Emit the commit command for one revision.

        Revisions that already have a mark, or that are excluded, are
        skipped.  Ghost revisions are recorded with the ghost mark and
        nothing is emitted for them.

        :param revid: id of the revision to export.
        :param git_branch: branch name the commit is written to; a
          temporary name is used instead for a parentless commit that is
          not the first one exported.
        """
        if revid in self.revid_to_mark or revid in self.excluded_revisions:
            return

        # Get the Revision object
        try:
            revobj = self.branch.repository.get_revision(revid)
        except bazErrors.NoSuchRevision:
            # This is a ghost revision. Mark it as not found and next!
            self.revid_to_mark[revid] = self._GHOST_MARK
            return

        # Get the primary parent
        # TODO: Consider the excluded revisions when deciding the parents.
        # Currently, a commit with parents that are excluded ought to be
        # triggering the git_branch calculation below (and it is not).
        # IGC 20090824
        ncommits = len(self.revid_to_mark)
        nparents = len(revobj.parent_ids)
        if nparents == 0:
            if ncommits:
                # This is a parentless commit but it's not the first one
                # output. We need to create a new temporary branch for it
                # otherwise git-fast-import will assume the previous commit
                # was this one's parent
                git_branch = self._next_tmp_branch_name()
            parent = bzrlib.revision.NULL_REVISION
        else:
            parent = revobj.parent_ids[0]

        # Print the commit
        git_ref = 'refs/heads/%s' % (git_branch,)
        mark = ncommits + 1
        self.revid_to_mark[revid] = mark
        file_cmds = self._get_filecommands(parent, revid)
        self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
            file_cmds))

        # Report progress and checkpoint if it's time for that
        self.report_progress(ncommits)
        if (self.checkpoint > 0 and ncommits
            and ncommits % self.checkpoint == 0):
            self.note("Exported %i commits - adding checkpoint to output"
                % ncommits)
            self._save_marks()
            self.print_cmd(commands.CheckpointCommand())

    def _get_name_email(self, user):
        """Split a user string into a (name, email) pair."""
        if '<' not in user:
            # If the email isn't inside <>, we need to use it as the name
            # in order for things to round-trip correctly.
            # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
            return user, ''
        name, email = parseaddr(user)
        return name, email

    def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
        """Build the CommitCommand for a revision.

        :param git_ref: ref the commit is written to.
        :param mark: mark assigned to the commit.
        :param revobj: the Revision object being exported.
        :param file_cmds: the file commands making up the commit.
        :return: a commands.CommitCommand.
        """
        timestamp = revobj.timestamp
        timezone = revobj.timezone

        # Get the committer and author info
        committer = revobj.committer
        name, email = self._get_name_email(committer)
        committer_info = (name, email, timestamp, timezone)
        if self._multi_author_api_available:
            authors = revobj.get_apparent_authors()
            author = authors[0]
            more_authors = authors[1:]
        else:
            author = revobj.get_apparent_author()
            more_authors = []

        # The author is only spelt out when it adds something to the
        # committer line: several authors, or one who isn't the committer.
        author_info = None
        more_author_info = None
        if more_authors or author != committer:
            name, email = self._get_name_email(author)
            author_info = (name, email, timestamp, timezone)
        if more_authors:
            more_author_info = []
            for a in more_authors:
                name, email = self._get_name_email(a)
                more_author_info.append((name, email, timestamp, timezone))

        # Get the parents in terms of marks
        non_ghost_parents = []
        for p in revobj.parent_ids:
            if p in self.excluded_revisions:
                continue
            parent_mark = self.revid_to_mark.get(p, self._GHOST_MARK)
            if parent_mark == self._GHOST_MARK:
                # ghost (unknown, or recorded as one by emit_commit) - ignore
                continue
            non_ghost_parents.append(":%s" % parent_mark)
        if non_ghost_parents:
            from_ = non_ghost_parents[0]
            merges = non_ghost_parents[1:]
        else:
            from_ = None
            merges = None

        # Filter the revision properties. Some metadata (like the
        # author information) is already exposed in other ways so
        # don't repeat it here. A new dict is built so that the Revision
        # object's own properties are left untouched.
        if self.plain_format:
            properties = None
        else:
            properties = dict((key, value)
                for key, value in revobj.properties.items()
                if key not in self.properties_to_exclude)

        # Build and return the result
        return commands.CommitCommand(git_ref, mark, author_info,
            committer_info, revobj.message, from_, merges, iter(file_cmds),
            more_authors=more_author_info, properties=properties)

    def _get_revision_trees(self, parent, revision_id):
        """Return the (old, new) revision trees to compare.

        If the parent's inventory is malformed, the tree of the parent's
        own first parent is used as the old tree.  If the revision's
        inventory is malformed, the new tree is None.
        """
        repository = self.branch.repository
        try:
            tree_old = repository.revision_tree(parent)
        except bazErrors.UnexpectedInventoryFormat:
            self.warning("Parent is malformed - diffing against previous parent")
            # We can't find the old parent. Let's diff against his parent
            pp = repository.get_revision(parent)
            tree_old = repository.revision_tree(pp.parent_ids[0])
        tree_new = None
        try:
            tree_new = repository.revision_tree(revision_id)
        except bazErrors.UnexpectedInventoryFormat:
            # We can't really do anything anymore
            self.warning("Revision %s is malformed - skipping" % revision_id)
        return tree_old, tree_new

    def _get_filecommands(self, parent, revision_id):
        """Get the list of FileCommands for the changes between two revisions."""
        tree_old, tree_new = self._get_revision_trees(parent, revision_id)
        if not (tree_old and tree_new):
            # Something is wrong with this revision - ignore the filecommands
            return []

        changes = tree_new.changes_from(tree_old)

        # Make "modified" have 3-tuples, as added does
        my_modified = [x[0:3] for x in changes.modified]

        # The potential interaction between renames and deletes is messy.
        # Handle it here ...
        file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
            changes.renamed, changes.removed, revision_id, tree_old)

        # Map kind changes to an add of the new kind.
        # IGC: emitting a delete first does not seem to be needed here and
        # in fact looks harmful. If you find you need one, please file a
        # bug explaining why.
        for path, id_, kind1, kind2 in changes.kind_changed:
            path = self._adjust_path_for_renames(path, renamed, revision_id)
            my_modified.append((path, id_, kind2))

        # Record modifications
        for path, id_, kind in changes.added + my_modified + rd_modifies:
            if kind == 'file':
                text = tree_new.get_file_text(id_)
                file_cmds.append(commands.FileModifyCommand(path,
                    helpers.kind_to_mode('file', tree_new.is_executable(id_)),
                    None, text))
            elif kind == 'symlink':
                file_cmds.append(commands.FileModifyCommand(path,
                    helpers.kind_to_mode('symlink', False),
                    None, tree_new.get_symlink_target(id_)))
            elif kind == 'directory':
                # Directories are only exported in the extended format.
                if not self.plain_format:
                    file_cmds.append(commands.FileModifyCommand(path,
                        helpers.kind_to_mode('directory', False),
                        None, None))
            else:
                self.warning("cannot export '%s' of kind %s yet - ignoring" %
                    (path, kind))
        return file_cmds

    def _process_renames_and_deletes(self, renames, deletes,
                                     revision_id, tree_old):
        """Turn renames and deletes into correctly ordered file commands.

        :param renames: entries of the form (oldpath, newpath, id, kind,
          text_modified, meta_modified).
        :param deletes: entries of the form (path, id, kind).
        :param revision_id: id of the revision, used in notes only.
        :param tree_old: the tree the changes are relative to.
        :return: (file_cmds, modifies, renamed) where modifies lists
          (newpath, id, kind) for renamed entries that also changed and
          renamed is a list of [oldpath, newpath] pairs.
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        must_be_renamed = {}
        old_to_new = {}
        deleted_paths = set([p for p, _, _ in deletes])
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            # Directory commands are only written in the extended format.
            emit = kind != 'directory' or not self.plain_format
            if newpath in deleted_paths:
                # Case 1 above: the delete has to precede the rename.
                if emit:
                    file_cmds.append(commands.FileDeleteCommand(newpath))
                deleted_paths.remove(newpath)
            if self.is_empty_dir(tree_old, oldpath):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                    revision_id))
                continue
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            if emit:
                file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
            if text_modified or meta_modified:
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(
                        from_dir=id_):
                    if e.kind == 'directory' and self.plain_format:
                        continue
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            still_to_be_renamed = set(must_be_renamed) - set(old_to_new)
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (old_child_path,
                        new_child_path))
                file_cmds.append(commands.FileRenameCommand(old_child_path,
                    new_child_path))

        # Record remaining deletes
        for path, id_, kind in deletes:
            if path not in deleted_paths:
                continue
            if kind == 'directory' and self.plain_format:
                continue
            file_cmds.append(commands.FileDeleteCommand(path))
        return file_cmds, modifies, renamed

    def _adjust_path_for_renames(self, path, renamed, revision_id):
        """Return ``path`` as it is named once ``renamed`` has been applied.

        :param path: the path to adjust.
        :param renamed: sequence of (old, new) pairs, applied in order.
        :param revision_id: id of the revision, used in notes only.
        """
        # If a previous rename is found, we should adjust the path
        for old, new in renamed:
            if path == old:
                self.note("Changing path %s given rename to %s in revision %s"
                    % (path, new, revision_id))
                path = new
            elif path.startswith(old + '/'):
                self.note(
                    "Adjusting path %s given rename of %s to %s in revision %s"
                    % (path, old, new, revision_id))
                # Swap the leading component only: the same text may occur
                # again further down the path and must be left alone.
                path = new + path[len(old):]
        return path

    def emit_tags(self):
        """Emit a reset command for every tag that points at a marked revision."""
        for tag, revid in self.branch.tags.get_tag_dict().items():
            mark = self.revid_to_mark.get(revid, self._GHOST_MARK)
            if mark == self._GHOST_MARK:
                # Either never exported or a ghost: there is no commit in
                # the stream for the tag to refer to.
                self.warning('not creating tag %r pointing to non-existent '
                    'revision %s' % (tag, revid))
            else:
                git_ref = 'refs/tags/%s' % tag
                self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))

    def _next_tmp_branch_name(self):
        """Return a unique branch name. The name will start with "tmp"."""
        prefix = 'tmp'
        # branch_names['tmp'] holds the last suffix handed out.
        if prefix not in self.branch_names:
            self.branch_names[prefix] = 0
        else:
            self.branch_names[prefix] += 1
        return '%s.%d' % (prefix, self.branch_names[prefix])