Cope with non-ascii characters in symbolic links.
[bzr-fastimport.git] / exporter.py
blobd122442561239cc42d2b7fa38640cf4df70230b1
1 # -*- coding: utf-8 -*-
3 # Copyright (C) 2008 Canonical Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 # Original Copyright (c) 2008 Adeodato Simó
20 # Original License: MIT (See exporters/bzr-fast-export.LICENSE)
22 # vim: fileencoding=utf-8
24 """Core engine for the fast-export command."""
26 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
27 # is not updated (because the parent of commit is already merged, so we don't
28 # set new_git_branch to the previously used name)
30 from email.Utils import parseaddr
31 import sys, time
33 import bzrlib.branch
34 import bzrlib.revision
35 from bzrlib import (
36 builtins,
37 errors as bazErrors,
38 osutils,
39 progress,
40 trace,
43 from bzrlib.plugins.fastimport import (
44 helpers,
45 marks_file,
48 from fastimport import commands
49 from fastimport.helpers import (
50 binary_stream,
51 single_plural,
def _get_output_stream(destination):
    """Return a writable binary stream for ``destination``.

    :param destination: None or '-' selects standard output (passed
        through ``binary_stream``); a name ending in 'gz' is opened with
        ``gzip``; any other name is opened as a regular file. Files are
        opened in 'wb' mode.
    """
    to_stdout = destination is None or destination == '-'
    if to_stdout:
        return binary_stream(sys.stdout)
    if not destination.endswith('gz'):
        return open(destination, 'wb')
    # Imported lazily: only needed when compressed output is requested.
    import gzip
    return gzip.open(destination, 'wb')
class BzrFastExporter(object):
    """Export the history of a Bazaar branch as a fast-import stream.

    Commands are built with the ``fastimport.commands`` classes and written
    to the output stream chosen by ``_get_output_stream``.
    """

    def __init__(self, source, destination, git_branch=None, checkpoint=-1,
            import_marks_file=None, export_marks_file=None, revision=None,
            verbose=False, plain_format=False):
        """Export branch data in fast import format.

        :param source: location handed to ``Branch.open_containing`` by
            ``run()``.
        :param destination: where to write the stream; see
            ``_get_output_stream``.
        :param git_branch: branch name used to build the
            ``refs/heads/<name>`` ref of each commit.
        :param checkpoint: if greater than zero, marks are saved and a
            checkpoint command is emitted every ``checkpoint`` commits.
        :param import_marks_file: optional marks file to load at start-up.
        :param export_marks_file: optional marks file to write when saving
            marks.
        :param revision: optional revision specification handed to
            ``builtins._get_revision_range``.
        :param verbose: if True, progress is reported every 100 commits
            instead of every 1000, and implicit renames are noted.
        :param plain_format: if True, 'classic' fast-import format is
            used without any extended features; if False, the generated
            data is richer and includes information like multiple
            authors, revision properties, etc.
        """
        self.source = source
        self.outf = _get_output_stream(destination)
        self.git_branch = git_branch
        self.checkpoint = checkpoint
        self.import_marks_file = import_marks_file
        self.export_marks_file = export_marks_file
        self.revision = revision
        self.excluded_revisions = set()
        self.plain_format = plain_format
        self._multi_author_api_available = hasattr(bzrlib.revision.Revision,
            'get_apparent_authors')
        self.properties_to_exclude = ['authors', 'author']

        # Progress reporting stuff
        self.verbose = verbose
        if verbose:
            self.progress_every = 100
        else:
            self.progress_every = 1000
        self._start_time = time.time()
        self._commit_total = 0

        # Load the marks and initialise things accordingly
        self.revid_to_mark = {}
        self.branch_names = {}
        if self.import_marks_file:
            marks_info = marks_file.import_marks(self.import_marks_file)
            if marks_info is not None:
                self.revid_to_mark = dict((r, m) for m, r in
                    marks_info[0].items())
                self.branch_names = marks_info[1]

    def interesting_history(self):
        """Return the list of revision ids to export, oldest first.

        As a side effect, when a starting revision was requested,
        ``self.excluded_revisions`` is set to the revisions returned by
        ``iter_merge_sorted_revisions(start_rev_id)`` so that they are
        never emitted.
        """
        if self.revision:
            rev1, rev2 = builtins._get_revision_range(self.revision,
                self.branch, "fast-export")
            start_rev_id = rev1.rev_id
            end_rev_id = rev2.rev_id
        else:
            start_rev_id = None
            end_rev_id = None
        self.note("Calculating the revisions to include ...")
        view_revisions = [rev_id for rev_id, _, _, _ in
            self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
        view_revisions.reverse()
        # If a starting point was given, we need to later check that we don't
        # start emitting revisions from before that point. Collect the
        # revisions to exclude now ...
        if start_rev_id is not None:
            self.note("Calculating the revisions to exclude ...")
            self.excluded_revisions = set(rev_id for rev_id, _, _, _ in
                self.branch.iter_merge_sorted_revisions(start_rev_id))
        return view_revisions

    def run(self):
        """Open the source branch and write the complete export stream."""
        # Open the source
        self.branch = bzrlib.branch.Branch.open_containing(self.source)[0]

        # Export the data
        self.branch.repository.lock_read()
        try:
            interesting = self.interesting_history()
            self._commit_total = len(interesting)
            self.note("Starting export of %d revisions ..." %
                self._commit_total)
            if not self.plain_format:
                self.emit_features()
            for revid in interesting:
                self.emit_commit(revid, self.git_branch)
            if self.branch.supports_tags():
                self.emit_tags()
        finally:
            self.branch.repository.unlock()

        # Save the marks if requested
        self._save_marks()
        self.dump_stats()

    def note(self, msg, *args):
        """Output a note but timestamp it."""
        msg = "%s %s" % (self._time_of_day(), msg)
        trace.note(msg, *args)

    def warning(self, msg, *args):
        """Output a warning but timestamp it."""
        msg = "%s WARNING: %s" % (self._time_of_day(), msg)
        trace.warning(msg, *args)

    def _time_of_day(self):
        """Time of day as a string."""
        # Note: this is a separate method so tests can patch in a fixed value
        return time.strftime("%H:%M:%S")

    def report_progress(self, commit_count, details=''):
        """Note progress every ``self.progress_every`` commits.

        :param commit_count: number of commits exported so far; nothing is
            reported when it is zero or not a multiple of
            ``self.progress_every``.
        :param details: extra text appended to the progress message.
        """
        if not commit_count or commit_count % self.progress_every != 0:
            return
        if self._commit_total:
            counts = "%d/%d" % (commit_count, self._commit_total)
        else:
            counts = "%d" % (commit_count,)
        minutes = (time.time() - self._start_time) / 60
        if minutes > 0:
            rate = commit_count * 1.0 / minutes
            if rate > 10:
                rate_str = "at %.0f/minute " % rate
            else:
                rate_str = "at %.1f/minute " % rate
        else:
            # No measurable time has elapsed (coarse clock): a rate cannot
            # be computed, and dividing would raise ZeroDivisionError.
            rate_str = ""
        self.note("%s commits exported %s%s" % (counts, rate_str, details))

    def dump_stats(self):
        """Note how many revisions are recorded in the marks and the time taken."""
        time_required = progress.str_tdelta(time.time() - self._start_time)
        rc = len(self.revid_to_mark)
        self.note("Exported %d %s in %s",
            rc, single_plural(rc, "revision", "revisions"),
            time_required)

    def print_cmd(self, cmd):
        """Write the repr() of ``cmd`` followed by a newline to the output."""
        self.outf.write("%r\n" % cmd)

    def _save_marks(self):
        """Write the mark -> revision-id map if an export marks file is set."""
        if self.export_marks_file:
            revision_ids = dict((m, r) for r, m in self.revid_to_mark.items())
            marks_file.export_marks(self.export_marks_file, revision_ids,
                self.branch_names)

    def is_empty_dir(self, tree, path):
        """Return True if ``path`` is a directory with no entries in ``tree``.

        Returns False (after a warning) when ``path`` has no file id in the
        tree, and False when it is not a directory.
        """
        path_id = tree.path2id(path)
        if path_id is None:
            self.warning("Skipping empty_dir detection - no file_id for %s" %
                (path,))
            return False

        # Continue if path is not a directory
        if tree.kind(path_id) != 'directory':
            return False

        # Use treewalk to find the contents of our directory. Only the first
        # yielded entry is needed, so stop there rather than walking (and
        # materialising) the whole subtree.
        for contents in tree.walkdirs(prefix=path):
            return len(contents[1]) == 0
        # NOTE(review): walkdirs yielded nothing for an existing directory;
        # be conservative and do not report the directory as empty.
        return False

    def emit_features(self):
        """Emit a feature command for every name in commands.FEATURE_NAMES."""
        for feature in sorted(commands.FEATURE_NAMES):
            self.print_cmd(commands.FeatureCommand(feature))

    def emit_commit(self, revid, git_branch):
        """Emit the commit command for ``revid`` unless already handled.

        Revisions that already have a mark or are excluded are skipped.
        Ghost revisions (not present in the repository) are recorded with
        the mark -1 and skipped.

        :param revid: the revision id to export.
        :param git_branch: branch name used for the commit's ref; replaced
            by a temporary name for parentless commits that are not the
            first commit output.
        """
        if revid in self.revid_to_mark or revid in self.excluded_revisions:
            return

        # Get the Revision object
        try:
            revobj = self.branch.repository.get_revision(revid)
        except bazErrors.NoSuchRevision:
            # This is a ghost revision. Mark it as not found and next!
            self.revid_to_mark[revid] = -1
            return

        # Get the primary parent
        # TODO: Consider the excluded revisions when deciding the parents.
        # Currently, a commit with parents that are excluded ought to be
        # triggering the git_branch calculation below (and it is not).
        # IGC 20090824
        ncommits = len(self.revid_to_mark)
        nparents = len(revobj.parent_ids)
        if nparents == 0:
            if ncommits:
                # This is a parentless commit but it's not the first one
                # output. We need to create a new temporary branch for it
                # otherwise git-fast-import will assume the previous commit
                # was this one's parent
                git_branch = self._next_tmp_branch_name()
            parent = bzrlib.revision.NULL_REVISION
        else:
            parent = revobj.parent_ids[0]

        # Print the commit
        git_ref = 'refs/heads/%s' % (git_branch,)
        mark = ncommits + 1
        self.revid_to_mark[revid] = mark
        file_cmds = self._get_filecommands(parent, revid)
        self.print_cmd(self._get_commit_command(git_ref, mark, revobj,
            file_cmds))

        # Report progress and checkpoint if it's time for that
        self.report_progress(ncommits)
        if (self.checkpoint > 0 and ncommits
            and ncommits % self.checkpoint == 0):
            self.note("Exported %i commits - adding checkpoint to output"
                % ncommits)
            self._save_marks()
            self.print_cmd(commands.CheckpointCommand())

    def _get_name_email(self, user):
        """Split a user string into a (name, email) tuple.

        A string without '<' is returned whole as the name with an empty
        email; otherwise the result of ``parseaddr`` is returned.
        """
        if user.find('<') == -1:
            # If the email isn't inside <>, we need to use it as the name
            # in order for things to round-trip correctly.
            # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
            name = user
            email = ''
        else:
            name, email = parseaddr(user)
        return name, email

    def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
        """Build the CommitCommand for a revision.

        :param git_ref: the ref the commit is made on.
        :param mark: the mark assigned to this commit.
        :param revobj: the revision object being exported.
        :param file_cmds: the file commands for the commit; passed on as an
            iterator.
        :return: a ``commands.CommitCommand``.
        """
        # Get the committer and author info
        committer = revobj.committer
        name, email = self._get_name_email(committer)
        committer_info = (name, email, revobj.timestamp, revobj.timezone)
        if self._multi_author_api_available:
            more_authors = revobj.get_apparent_authors()
            author = more_authors.pop(0)
        else:
            more_authors = []
            author = revobj.get_apparent_author()
        if more_authors:
            name, email = self._get_name_email(author)
            author_info = (name, email, revobj.timestamp, revobj.timezone)
            more_author_info = []
            for a in more_authors:
                name, email = self._get_name_email(a)
                more_author_info.append(
                    (name, email, revobj.timestamp, revobj.timezone))
        elif author != committer:
            name, email = self._get_name_email(author)
            author_info = (name, email, revobj.timestamp, revobj.timezone)
            more_author_info = None
        else:
            author_info = None
            more_author_info = None

        # Get the parents in terms of marks
        non_ghost_parents = []
        for p in revobj.parent_ids:
            if p in self.excluded_revisions:
                continue
            try:
                parent_mark = self.revid_to_mark[p]
                non_ghost_parents.append(":%s" % parent_mark)
            except KeyError:
                # ghost - ignore
                continue
        if non_ghost_parents:
            from_ = non_ghost_parents[0]
            merges = non_ghost_parents[1:]
        else:
            from_ = None
            merges = None

        # Filter the revision properties. Some metadata (like the
        # author information) is already exposed in other ways so
        # don't repeat it here.
        if self.plain_format:
            properties = None
        else:
            # Work on a copy so the revision object's own properties
            # dictionary is left untouched.
            properties = dict(revobj.properties)
            for prop in self.properties_to_exclude:
                properties.pop(prop, None)

        # Build and return the result
        return commands.CommitCommand(git_ref, mark, author_info,
            committer_info, revobj.message, from_, merges, iter(file_cmds),
            more_authors=more_author_info, properties=properties)

    def _get_revision_trees(self, parent, revision_id):
        """Return the (old, new) revision trees for a parent/revision pair.

        If the parent's inventory is malformed, the parent's own first
        parent is used for the old tree. If the revision's inventory is
        malformed, the new tree is returned as None.
        """
        try:
            tree_old = self.branch.repository.revision_tree(parent)
        except bazErrors.UnexpectedInventoryFormat:
            self.warning("Parent is malformed - diffing against previous parent")
            # We can't find the old parent. Let's diff against his parent
            pp = self.branch.repository.get_revision(parent)
            tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
        tree_new = None
        try:
            tree_new = self.branch.repository.revision_tree(revision_id)
        except bazErrors.UnexpectedInventoryFormat:
            # We can't really do anything anymore
            self.warning("Revision %s is malformed - skipping" % revision_id)
        return tree_old, tree_new

    def _get_filecommands(self, parent, revision_id):
        """Get the list of FileCommands for the changes between two revisions."""
        tree_old, tree_new = self._get_revision_trees(parent, revision_id)
        if not(tree_old and tree_new):
            # Something is wrong with this revision - ignore the filecommands
            return []

        changes = tree_new.changes_from(tree_old)

        # Make "modified" have 3-tuples, as added does
        my_modified = [ x[0:3] for x in changes.modified ]

        # The potential interaction between renames and deletes is messy.
        # Handle it here ...
        file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
            changes.renamed, changes.removed, revision_id, tree_old)

        # Map kind changes to a delete followed by an add
        for path, id_, kind1, kind2 in changes.kind_changed:
            path = self._adjust_path_for_renames(path, renamed, revision_id)
            # IGC: I don't understand why a delete is needed here.
            # In fact, it seems harmful? If you uncomment this line,
            # please file a bug explaining why you needed to.
            #file_cmds.append(commands.FileDeleteCommand(path))
            my_modified.append((path, id_, kind2))

        # Record modifications
        for path, id_, kind in changes.added + my_modified + rd_modifies:
            if kind == 'file':
                text = tree_new.get_file_text(id_)
                file_cmds.append(commands.FileModifyCommand(path,
                    helpers.kind_to_mode('file', tree_new.is_executable(id_)),
                    None, text))
            elif kind == 'symlink':
                file_cmds.append(commands.FileModifyCommand(path,
                    helpers.kind_to_mode('symlink', False),
                    None, tree_new.get_symlink_target(id_)))
            elif kind == 'directory':
                if not self.plain_format:
                    file_cmds.append(commands.FileModifyCommand(path,
                        helpers.kind_to_mode('directory', False),
                        None, None))
            else:
                self.warning("cannot export '%s' of kind %s yet - ignoring" %
                    (path, kind))
        return file_cmds

    def _process_renames_and_deletes(self, renames, deletes,
            revision_id, tree_old):
        """Turn the renames and deletes of a revision into ordered commands.

        :param renames: iterable of (oldpath, newpath, id, kind,
            text_modified, meta_modified) tuples.
        :param deletes: iterable of (path, id, kind) tuples.
        :param revision_id: used only in messages.
        :param tree_old: the tree the changes are relative to.
        :return: a tuple (file_cmds, modifies, renamed) where ``modifies``
            lists (newpath, id, kind) for renamed entries that were also
            modified, and ``renamed`` lists [oldpath, newpath] pairs.
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        must_be_renamed = {}
        old_to_new = {}
        deleted_paths = set([p for p, _, _ in deletes])
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            emit = kind != 'directory' or not self.plain_format
            if newpath in deleted_paths:
                if emit:
                    file_cmds.append(commands.FileDeleteCommand(newpath))
                deleted_paths.remove(newpath)
            if (self.is_empty_dir(tree_old, oldpath)):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                    revision_id))
                continue
            #oldpath = self._adjust_path_for_renames(oldpath, renamed,
            #    revision_id)
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            if emit:
                file_cmds.append(commands.FileRenameCommand(oldpath, newpath))
            if text_modified or meta_modified:
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(from_dir=id_):
                    if e.kind == 'directory' and self.plain_format:
                        continue
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            renamed_already = set(old_to_new.keys())
            still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (old_child_path,
                        new_child_path))
                file_cmds.append(commands.FileRenameCommand(old_child_path,
                    new_child_path))

        # Record remaining deletes
        for path, id_, kind in deletes:
            if path not in deleted_paths:
                continue
            if kind == 'directory' and self.plain_format:
                continue
            #path = self._adjust_path_for_renames(path, renamed, revision_id)
            file_cmds.append(commands.FileDeleteCommand(path))
        return file_cmds, modifies, renamed

    @staticmethod
    def _rebase_path(path, old, new):
        """Return ``path`` with its leading ``old`` directory replaced by ``new``.

        Returns None when ``path`` does not live under the ``old`` directory.
        """
        prefix = old + '/'
        if path.startswith(prefix):
            return new + '/' + path[len(prefix):]
        return None

    def _adjust_path_for_renames(self, path, renamed, revision_id):
        """Return ``path`` adjusted for renames already recorded.

        :param path: the path to adjust.
        :param renamed: sequence of (old, new) pairs, applied in order.
        :param revision_id: used only in messages.
        """
        # If a previous rename is found, we should adjust the path
        for old, new in renamed:
            if path == old:
                self.note("Changing path %s given rename to %s in revision %s"
                    % (path, new, revision_id))
                path = new
                continue
            # Only the leading directory is swapped; later occurrences of
            # the same name deeper in the path must be left alone.
            rebased = self._rebase_path(path, old, new)
            if rebased is not None:
                self.note(
                    "Adjusting path %s given rename of %s to %s in revision %s"
                    % (path, old, new, revision_id))
                path = rebased
        return path

    def emit_tags(self):
        """Emit a reset command for every tag whose revision has a mark."""
        for tag, revid in self.branch.tags.get_tag_dict().items():
            try:
                mark = self.revid_to_mark[revid]
            except KeyError:
                self.warning('not creating tag %r pointing to non-existent '
                    'revision %s' % (tag, revid))
            else:
                git_ref = 'refs/tags/%s' % tag.encode("utf-8")
                self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))

    def _next_tmp_branch_name(self):
        """Return a unique branch name. The name will start with "tmp"."""
        prefix = 'tmp'
        if prefix not in self.branch_names:
            self.branch_names[prefix] = 0
        else:
            self.branch_names[prefix] += 1
        prefix = '%s.%d' % (prefix, self.branch_names[prefix])
        return prefix