Remove processors, now in python-fastimport.
[bzr-fastimport.git] / bzr_exporter.py
blob8cff2ab9d5e0d1ba72433595279e9eaeaca4fae7
1 # -*- coding: utf-8 -*-
3 # Copyright (C) 2008 Canonical Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 # Original Copyright (c) 2008 Adeodato Simó
20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
22 # vim: fileencoding=utf-8
24 """Core engine for the fast-export command."""
26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
27 # is not updated (because the parent of commit is already merged, so we don't
28 # set new_git_branch to the previously used name)
30 from email.Utils import parseaddr
31 import sys, time
33 import bzrlib.branch
34 import bzrlib.revision
35 from bzrlib import (
36 builtins,
37 errors as bazErrors,
38 osutils,
39 progress,
40 trace,
43 from bzrlib.plugins.fastimport import commands, helpers, marks_file
46 class BzrFastExporter(object):
def __init__(self, source, destination, git_branch=None, checkpoint=-1,
        import_marks_file=None, export_marks_file=None, revision=None,
        verbose=False, plain_format=False):
    """Set up an exporter that writes branch data in fast-import format.

    :param source: location later passed to Branch.open_containing()
        by run().
    :param destination: where the stream goes. None or '-' selects
        standard output, a name ending in 'gz' is written through gzip
        and anything else is opened as a plain binary file.
    :param git_branch: branch name used to build the refs/heads/ ref
        of emitted commits.
    :param checkpoint: consulted by emit_commit() to decide when to
        checkpoint; values that are not greater than zero disable it.
    :param import_marks_file: optional marks file loaded right here.
    :param export_marks_file: optional marks file written by
        _save_marks().
    :param revision: optional revision range restricting the export.
    :param verbose: if True, progress is reported every 100 commits
        instead of every 1000.
    :param plain_format: if True, 'classic' fast-import format is
        used without any extended features; if False, the generated
        data is richer and includes information like multiple
        authors, revision properties, etc.
    """
    self.source = source

    # Pick the output stream
    to_stdout = destination is None or destination == '-'
    if to_stdout:
        self.outf = helpers.binary_stream(sys.stdout)
    elif destination.endswith('gz'):
        import gzip
        self.outf = gzip.open(destination, 'wb')
    else:
        self.outf = open(destination, 'wb')

    # Plain option storage
    self.git_branch = git_branch
    self.checkpoint = checkpoint
    self.import_marks_file = import_marks_file
    self.export_marks_file = export_marks_file
    self.revision = revision
    self.excluded_revisions = set()
    self.plain_format = plain_format
    # Older bzrlib versions only offer the single-author accessor
    self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
        'get_apparent_authors')
    self.properties_to_exclude = ['authors', 'author']

    # Progress reporting state
    self.verbose = verbose
    self.progress_every = 1000
    if verbose:
        self.progress_every = 100
    self._start_time = time.time()
    self._commit_total = 0

    # Mark bookkeeping, optionally seeded from an existing marks file
    self.revid_to_mark = {}
    self.branch_names = {}
    if self.import_marks_file:
        marks_info = marks_file.import_marks(self.import_marks_file)
        if marks_info is not None:
            # The file maps mark -> revid; lookups here go the other way
            revid_to_mark = {}
            for mark, revid in marks_info[0].items():
                revid_to_mark[revid] = mark
            self.revid_to_mark = revid_to_mark
            self.branch_names = marks_info[1]
96 def interesting_history(self):
97 if self.revision:
98 rev1, rev2 = builtins._get_revision_range(self.revision,
99 self.branch, "fast-export")
100 start_rev_id = rev1.rev_id
101 end_rev_id = rev2.rev_id
102 else:
103 start_rev_id = None
104 end_rev_id = None
105 self.note("Calculating the revisions to include ...")
106 view_revisions = reversed([rev_id for rev_id, _, _, _ in
107 self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)])
108 # If a starting point was given, we need to later check that we don't
109 # start emitting revisions from before that point. Collect the
110 # revisions to exclude now ...
111 if start_rev_id is not None:
112 self.note("Calculating the revisions to exclude ...")
113 self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
114 self.branch.iter_merge_sorted_revisions(start_rev_id)])
115 return list(view_revisions)
def run(self):
    """Export the branch found at self.source.

    The repository is read-locked while features, commits and tags are
    emitted and is unlocked again even if emission fails. Marks are
    saved and statistics logged afterwards.
    """
    # Open the source
    self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]
    repository = self.branch.repository

    # Export the data
    repository.lock_read()
    try:
        revids = self.interesting_history()
        total = len(revids)
        self._commit_total = total
        self.note("Starting export of %d revisions ..." % total)
        if not self.plain_format:
            self.emit_features()
        for revid in revids:
            self.emit_commit(revid, self.git_branch)
        if self.branch.supports_tags():
            self.emit_tags()
    finally:
        repository.unlock()

    # Save the marks if requested, then summarise
    self._save_marks()
    self.dump_stats()
def note(self, msg, *args):
    """Pass msg to trace.note(), prefixed with the current time of day."""
    stamped = "%s %s" % (self._time_of_day(), msg)
    trace.note(stamped, *args)
def warning(self, msg, *args):
    """Pass msg to trace.warning(), prefixed with the time and "WARNING:"."""
    stamped = "%s WARNING: %s" % (self._time_of_day(), msg)
    trace.warning(stamped, *args)
151 def _time_of_day(self):
152 """Time of day as a string."""
153 # Note: this is a separate method so tests can patch in a fixed value
154 return time.strftime("%H:%M:%S")
156 def report_progress(self, commit_count, details=''):
157 if commit_count and commit_count % self.progress_every == 0:
158 if self._commit_total:
159 counts = "%d/%d" % (commit_count, self._commit_total)
160 else:
161 counts = "%d" % (commit_count,)
162 minutes = (time.time() - self._start_time) / 60
163 rate = commit_count * 1.0 / minutes
164 if rate > 10:
165 rate_str = "at %.0f/minute " % rate
166 else:
167 rate_str = "at %.1f/minute " % rate
168 self.note("%s commits exported %s%s" % (counts, rate_str, details))
def dump_stats(self):
    """Log the number of entries in the mark table and the elapsed time."""
    elapsed = time.time() - self._start_time
    time_required = progress.str_tdelta(elapsed)
    count = len(self.revid_to_mark)
    noun = helpers.single_plural(count, "revision", "revisions")
    self.note("Exported %d %s in %s", count, noun, time_required)
177 def print_cmd(self, cmd):
178 self.outf.write("%r\n" % cmd)
180 def _save_marks(self):
181 if self.export_marks_file:
182 revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
183 marks_file.export_marks(self.export_marks_file, revision_ids,
184 self.branch_names)
def is_empty_dir(self, tree, path):
    """Return True if path is a directory in tree with no entries.

    :param tree: object providing path2id(), kind() and walkdirs().
    :param path: path to inspect.
    :return: False when path has no file id (a warning is logged),
        when it is not a directory, or when walkdirs() yields nothing
        for it; otherwise whether the first walkdirs() item lists no
        entries.
    """
    path_id = tree.path2id(path)
    if path_id is None:
        self.warning("Skipping empty_dir detection - no file_id for %s" %
            (path,))
        return False

    # Only a directory can be an empty directory
    if tree.kind(path_id) != 'directory':
        return False

    # Only the first item produced by walkdirs() is needed (presumably
    # the entry for path itself - confirm against bzrlib), so stop
    # there instead of materialising the walk of the whole subtree.
    for contents in tree.walkdirs(prefix=path):
        return len(contents[1]) == 0

    # walkdirs() produced nothing, so emptiness cannot be established
    return False
def emit_features(self):
    """Emit one feature command per name in commands.FEATURE_NAMES, sorted."""
    names = list(commands.FEATURE_NAMES)
    names.sort()
    for name in names:
        self.print_cmd(commands.FeatureCommand(name))
def emit_commit(self, revid, git_branch):
    """Emit the commit command for one revision, unless it is skipped.

    Nothing is emitted when revid already has an entry in
    self.revid_to_mark, is in self.excluded_revisions, or cannot be
    loaded from the repository (in which case it is recorded with the
    mark -1).

    :param revid: id of the revision to export.
    :param git_branch: branch name used to build the refs/heads/ ref.
        It is replaced by a temporary name for a parentless revision
        when self.revid_to_mark already has entries.
    """
    if revid in self.revid_to_mark or revid in self.excluded_revisions:
        return

    # Get the Revision object
    try:
        revobj = self.branch.repository.get_revision(revid)
    except bazErrors.NoSuchRevision:
        # This is a ghost revision. Mark it as not found and next!
        self.revid_to_mark[revid] = -1
        return

    # Get the primary parent
    # TODO: Consider the excluded revisions when deciding the parents.
    # Currently, a commit with parents that are excluded ought to be
    # triggering the git_branch calculation below (and it is not).
    # IGC 20090824
    # ncommits counts every entry in the mark table, which includes
    # the -1 ghost entries recorded above.
    ncommits = len(self.revid_to_mark)
    nparents = len(revobj.parent_ids)
    if nparents == 0:
        if ncommits:
            # This is a parentless commit but it's not the first one
            # output. We need to create a new temporary branch for it
            # otherwise git-fast-import will assume the previous commit
            # was this one's parent
            git_branch = self._next_tmp_branch_name()
        parent = bzrlib.revision.NULL_REVISION
    else:
        parent = revobj.parent_ids[0]

    # Print the commit
    git_ref = 'refs/heads/%s' % (git_branch,)
    mark = ncommits + 1
    # The mark is recorded before the file commands are computed
    self.revid_to_mark[revid] = mark
    file_cmds = self._get_filecommands(parent, revid)
    self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
        file_cmds))

    # Report progress and checkpoint if it's time for that
    # (ncommits was taken before this commit's mark was added, so it
    # is the number of entries that existed beforehand)
    self.report_progress(ncommits)
    if (self.checkpoint > 0 and ncommits
        and ncommits % self.checkpoint == 0):
        self.note("Exported %i commits - adding checkpoint to output"
            % ncommits)
        self._save_marks()
        self.print_cmd(commands.CheckpointCommand())
255 def _get_name_email(self, user):
256 if user.find('<') == -1:
257 # If the email isn't inside <>, we need to use it as the name
258 # in order for things to round-trip correctly.
259 # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
260 name = user
261 email = ''
262 else:
263 name, email = parseaddr(user)
264 return name, email
def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
    """Build the CommitCommand describing one revision.

    :param git_ref: ref the commit is written to.
    :param mark: mark assigned to this commit.
    :param revobj: revision object; its committer, timestamp, timezone,
        parent_ids, message, properties and author accessor are read.
        Its properties dict is left unmodified.
    :param file_cmds: iterable of file commands for the commit.
    :return: a commands.CommitCommand.
    """
    # Get the committer and author info
    committer = revobj.committer
    name, email = self._get_name_email(committer)
    committer_info = (name, email, revobj.timestamp, revobj.timezone)
    if self._multi_author_api_available:
        more_authors = revobj.get_apparent_authors()
        author = more_authors.pop(0)
    else:
        more_authors = []
        author = revobj.get_apparent_author()
    if more_authors:
        name, email = self._get_name_email(author)
        author_info = (name, email, revobj.timestamp, revobj.timezone)
        more_author_info = []
        for a in more_authors:
            name, email = self._get_name_email(a)
            more_author_info.append(
                (name, email, revobj.timestamp, revobj.timezone))
    elif author != committer:
        name, email = self._get_name_email(author)
        author_info = (name, email, revobj.timestamp, revobj.timezone)
        more_author_info = None
    else:
        # Author and committer are the same: no separate author is passed
        author_info = None
        more_author_info = None

    # Get the parents in terms of marks
    non_ghost_parents = []
    for p in revobj.parent_ids:
        if p in self.excluded_revisions:
            continue
        try:
            parent_mark = self.revid_to_mark[p]
        except KeyError:
            # ghost - ignore
            continue
        # emit_commit() records a revision it could not load with the
        # mark -1; that is not a usable mark, so treat it as a ghost too.
        # NOTE(review): marks reloaded from a marks file may be strings -
        # confirm; both spellings are checked to be safe.
        if parent_mark in (-1, '-1'):
            continue
        non_ghost_parents.append(":%s" % parent_mark)
    if non_ghost_parents:
        from_ = non_ghost_parents[0]
        merges = non_ghost_parents[1:]
    else:
        from_ = None
        merges = None

    # Filter the revision properties. Some metadata (like the
    # author information) is already exposed in other ways so
    # don't repeat it here.
    if self.plain_format:
        properties = None
    else:
        # Build a filtered copy rather than deleting keys in place, so
        # the revision object handed in by the caller is not mutated.
        excluded = self.properties_to_exclude
        properties = dict((key, value)
            for key, value in revobj.properties.items()
            if key not in excluded)

    # Build and return the result
    return commands.CommitCommand(git_ref, mark, author_info,
        committer_info, revobj.message, from_, merges, iter(file_cmds),
        more_authors=more_author_info, properties=properties)
329 def _get_revision_trees(self, parent, revision_id):
330 try:
331 tree_old = self.branch.repository.revision_tree(parent)
332 except bazErrors.UnexpectedInventoryFormat:
333 self.warning("Parent is malformed - diffing against previous parent")
334 # We can't find the old parent. Let's diff against his parent
335 pp = self.branch.repository.get_revision(parent)
336 tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
337 tree_new = None
338 try:
339 tree_new = self.branch.repository.revision_tree(revision_id)
340 except bazErrors.UnexpectedInventoryFormat:
341 # We can't really do anything anymore
342 self.warning("Revision %s is malformed - skipping" % revision_id)
343 return tree_old, tree_new
345 def _get_filecommands(self, parent, revision_id):
346 """Get the list of FileCommands for the changes between two revisions."""
347 tree_old, tree_new = self._get_revision_trees(parent, revision_id)
348 if not(tree_old and tree_new):
349 # Something is wrong with this revision - ignore the filecommands
350 return []
352 changes = tree_new.changes_from(tree_old)
354 # Make "modified" have 3-tuples, as added does
355 my_modified = [ x[0:3] for x in changes.modified ]
357 # The potential interaction between renames and deletes is messy.
358 # Handle it here ...
359 file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
360 changes.renamed, changes.removed, revision_id, tree_old)
362 # Map kind changes to a delete followed by an add
363 for path, id_, kind1, kind2 in changes.kind_changed:
364 path = self._adjust_path_for_renames(path, renamed, revision_id)
365 # IGC: I don't understand why a delete is needed here.
366 # In fact, it seems harmful? If you uncomment this line,
367 # please file a bug explaining why you needed to.
368 #file_cmds.append(commands.FileDeleteCommand(path))
369 my_modified.append((path, id_, kind2))
371 # Record modifications
372 for path, id_, kind in changes.added + my_modified + rd_modifies:
373 if kind == 'file':
374 text = tree_new.get_file_text(id_)
375 file_cmds.append(commands.FileModifyCommand(path, 'file',
376 tree_new.is_executable(id_), None, text))
377 elif kind == 'symlink':
378 file_cmds.append(commands.FileModifyCommand(path, 'symlink',
379 False, None, tree_new.get_symlink_target(id_)))
380 elif kind == 'directory':
381 if not self.plain_format:
382 file_cmds.append(commands.FileModifyCommand(path, 'directory',
383 False, None, None))
384 else:
385 self.warning("cannot export '%s' of kind %s yet - ignoring" %
386 (path, kind))
387 return file_cmds
def _process_renames_and_deletes(self, renames, deletes,
        revision_id, tree_old):
    """Work out the ordered delete and rename commands for one revision.

    :param renames: iterable of (oldpath, newpath, id, kind,
        text_modified, meta_modified) tuples.
    :param deletes: collection of (path, id, kind) tuples; it is
        iterated twice, so it must be re-iterable.
    :param revision_id: only used in log messages.
    :param tree_old: tree the renames start from; consulted for
        empty-directory detection and for the children of renamed
        directories.
    :return: a (file_cmds, modifies, renamed) tuple. file_cmds is the
        ordered list of delete/rename commands, modifies lists
        (newpath, id, kind) for renamed entries whose text or metadata
        also changed, and renamed lists [oldpath, newpath] pairs.
    """
    file_cmds = []
    modifies = []
    renamed = []

    # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
    # In a nutshell, there are several nasty cases:
    #
    # 1) bzr rm a; bzr mv b a; bzr commit
    # 2) bzr mv x/y z; bzr rm x; commit
    #
    # The first must come out with the delete first like this:
    #
    # D a
    # R b a
    #
    # The second case must come out with the rename first like this:
    #
    # R x/y z
    # D x
    #
    # So outputting all deletes first or all renames first won't work.
    # Instead, we need to make multiple passes over the various lists to
    # get the ordering right.

    # old child path -> new child path for children of renamed directories
    must_be_renamed = {}
    # oldpath -> newpath for every rename recorded below
    old_to_new = {}
    # Deleted paths not yet emitted; entries are removed as they are handled
    deleted_paths = set([p for p, _, _ in deletes])
    for (oldpath, newpath, id_, kind,
            text_modified, meta_modified) in renames:
        # Directory commands are suppressed in plain format
        emit = kind != 'directory' or not self.plain_format
        if newpath in deleted_paths:
            # Case 1 above: the rename target was deleted in this same
            # revision, so its delete goes before the rename
            if emit:
                file_cmds.append(commands.FileDeleteCommand(newpath))
            deleted_paths.remove(newpath)
        if (self.is_empty_dir(tree_old, oldpath)):
            self.note("Skipping empty dir %s in rev %s" % (oldpath,
                revision_id))
            continue
        #oldpath = self._adjust_path_for_renames(oldpath, renamed,
        #    revision_id)
        renamed.append([oldpath, newpath])
        old_to_new[oldpath] = newpath
        if emit:
            file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
        if text_modified or meta_modified:
            modifies.append((newpath, id_, kind))

        # Renaming a directory implies all children must be renamed.
        # Note: changes_from() doesn't handle this
        if kind == 'directory':
            for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                if e.kind == 'directory' and self.plain_format:
                    continue
                old_child_path = osutils.pathjoin(oldpath, p)
                new_child_path = osutils.pathjoin(newpath, p)
                must_be_renamed[old_child_path] = new_child_path

    # Add children not already renamed
    if must_be_renamed:
        renamed_already = set(old_to_new.keys())
        still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
        # Sorted so the implicit renames are emitted in a stable order
        for old_child_path in sorted(still_to_be_renamed):
            new_child_path = must_be_renamed[old_child_path]
            if self.verbose:
                self.note("implicitly renaming %s => %s" % (old_child_path,
                    new_child_path))
            file_cmds.append(commands.FileRenameCommand(old_child_path,
                new_child_path))

    # Record remaining deletes
    for path, id_, kind in deletes:
        # Skip deletes already emitted ahead of a rename above
        if path not in deleted_paths:
            continue
        if kind == 'directory' and self.plain_format:
            continue
        #path = self._adjust_path_for_renames(path, renamed, revision_id)
        file_cmds.append(commands.FileDeleteCommand(path))
    return file_cmds, modifies, renamed
470 def _adjust_path_for_renames(self, path, renamed, revision_id):
471 # If a previous rename is found, we should adjust the path
472 for old, new in renamed:
473 if path == old:
474 self.note("Changing path %s given rename to %s in revision %s"
475 % (path, new, revision_id))
476 path = new
477 elif path.startswith(old + '/'):
478 self.note(
479 "Adjusting path %s given rename of %s to %s in revision %s"
480 % (path, old, new, revision_id))
481 path = path.replace(old + "/", new + "/")
482 return path
484 def emit_tags(self):
485 for tag, revid in self.branch.tags.get_tag_dict().items():
486 try:
487 mark = self.revid_to_mark[revid]
488 except KeyError:
489 self.warning('not creating tag %r pointing to non-existent '
490 'revision %s' % (tag, revid))
491 else:
492 git_ref = 'refs/tags/%s' % tag
493 self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))
495 def _next_tmp_branch_name(self):
496 """Return a unique branch name. The name will start with "tmp"."""
497 prefix = 'tmp'
498 if prefix not in self.branch_names:
499 self.branch_names[prefix] = 0
500 else:
501 self.branch_names[prefix] += 1
502 prefix = '%s.%d' % (prefix, self.branch_names[prefix])
503 return prefix