Reimport some modules removed from python-fastimport 0.9.2.
[bzr-fastimport.git] / exporter.py
bloba3bc1b19b50971bb1615f30d00638383ca6e2a1d
1 # -*- coding: utf-8 -*-
3 # Copyright (C) 2008 Canonical Ltd
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # Based on bzr-fast-export
19 # Copyright (c) 2008 Adeodato Simó
21 # Permission is hereby granted, free of charge, to any person obtaining
22 # a copy of this software and associated documentation files (the
23 # "Software"), to deal in the Software without restriction, including
24 # without limitation the rights to use, copy, modify, merge, publish,
25 # distribute, sublicense, and/or sell copies of the Software, and to
26 # permit persons to whom the Software is furnished to do so, subject to
27 # the following conditions:
29 # The above copyright notice and this permission notice shall be included
30 # in all copies or substantial portions of the Software.
32 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
33 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
34 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
35 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
36 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
37 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
38 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
40 # vim: fileencoding=utf-8
42 """Core engine for the fast-export command."""
44 # TODO: if a new_git_branch below gets merged repeatedly, the tip of the branch
45 # is not updated (because the parent of commit is already merged, so we don't
46 # set new_git_branch to the previously used name)
48 from email.Utils import parseaddr
49 import sys, time, re
51 import bzrlib.branch
52 import bzrlib.revision
53 from bzrlib import (
54 builtins,
55 errors as bazErrors,
56 osutils,
57 progress,
58 trace,
61 from bzrlib.plugins.fastimport import (
62 helpers,
63 marks_file,
66 from fastimport import commands
67 from fastimport.helpers import (
68 binary_stream,
69 single_plural,
def _get_output_stream(destination):
    """Open the stream that the fast-export data should be written to.

    :param destination: None or '-' selects standard output (wrapped by
        ``binary_stream``); a name ending in 'gz' is opened through gzip;
        any other name is opened as a plain file in binary write mode.
    :return: a writable stream; the caller owns it.
    """
    to_stdout = destination is None or destination == '-'
    if to_stdout:
        return binary_stream(sys.stdout)
    if destination.endswith('gz'):
        # gzip is only needed for compressed output, so import it lazily.
        import gzip
        return gzip.open(destination, 'wb')
    return open(destination, 'wb')
82 # from dulwich.repo:
def check_ref_format(refname):
    """Check if a refname is correctly formatted.

    The checks below are modelled on git-check-ref-format[1] and are kept
    as separate statements so they can be compared with that list.

    [1] http://www.kernel.org/pub/software/scm/git/docs/git-check-ref-format.html

    :param refname: The refname to check
    :return: True if refname passes every check, False otherwise
    """
    # No path component may start with a dot.
    if '/.' in refname or refname.startswith('.'):
        return False
    # At least one '/' is required (this also rejects the empty string, so
    # the refname[-1] lookup further down is safe).
    if '/' not in refname:
        return False
    if '..' in refname:
        return False
    # Control characters (below 0x20, i.e. octal 040), DEL and the
    # characters listed here are rejected. 0x20 is used rather than the
    # legacy literal 040, which is a syntax error on Python 3.
    for c in refname:
        if ord(c) < 0x20 or c in '\177 ~^:?*[':
            return False
    if refname[-1] in '/.':
        return False
    if refname.endswith('.lock'):
        return False
    if '@{' in refname:
        return False
    if '\\' in refname:
        return False
    return True
def sanitize_ref_name_for_git(refname):
    """Rewrite refname so that it will be accepted by git-fast-import.

    Every substring that check_ref_format would object to is replaced by
    a single underscore. For the detailed rules see check_ref_format.

    By rewriting the refname we are breaking uniqueness guarantees provided
    by bzr, so the caller has to verify that the resulting ref names are
    unique; this function does not do that.

    :param refname: refname to rewrite
    :return: new refname
    """
    control_chars = "".join([chr(x) for x in range(0x20)])
    pattern = (
        # '/.' in refname or startswith '.'
        r"/\.|^\."
        # '..' in refname
        r"|\.\."
        # ord(c) < 0x20
        r"|[" + control_chars + r"]"
        # c in '\177 ~^:?*['
        r"|[\177 ~^:?*\[]"
        # last char in "/."
        r"|[/.]$"
        # endswith '.lock' -- the dot must be escaped, otherwise any name
        # ending in "<char>lock" (e.g. "unlock") would be mangled too
        r"|\.lock$"
        # "@{" in refname
        r"|@\{"
        # "\\" in refname
        r"|\\")
    return re.sub(pattern, "_", refname)
class BzrFastExporter(object):
    """Write the history of a bzr branch as a fast-import command stream."""

    # Value stored in revid_to_mark for a ghost revision (one whose
    # Revision object could not be loaded). It never names a real commit
    # and must not be emitted as a ":mark" reference.
    _GHOST_MARK = -1

    def __init__(self, source, outf, git_branch=None, checkpoint=-1,
                 import_marks_file=None, export_marks_file=None,
                 revision=None, verbose=False, plain_format=False,
                 rewrite_tags=False, baseline=False):
        """Export branch data in fast import format.

        :param source: the branch to export.
        :param outf: stream the commands are written to.
        :param git_branch: name used to build the 'refs/heads/<name>' ref
            that commits are emitted on.
        :param checkpoint: if greater than zero, a checkpoint command is
            emitted (and the marks are saved) every that many commits.
        :param import_marks_file: if set, marks are loaded from it so that
            revisions already marked are not emitted again.
        :param export_marks_file: if set, marks are saved to it.
        :param revision: optional revision range restricting the export.
        :param verbose: if True, progress is reported every 100 commits
            instead of every 1000 and implicit renames are noted.
        :param plain_format: if True, 'classic' fast-import format is
            used without any extended features; if False, the generated
            data is richer and includes information like multiple
            authors, revision properties, etc.
        :param rewrite_tags: if True and if plain_format is set, tag names
            will be rewritten to be git-compatible.
            Otherwise tags which aren't valid for git will be skipped if
            plain_format is set.
        :param baseline: if True, a full tree of the starting revision is
            emitted before the first exported commit.
        """
        self.branch = source
        self.outf = outf
        self.git_branch = git_branch
        self.checkpoint = checkpoint
        self.import_marks_file = import_marks_file
        self.export_marks_file = export_marks_file
        self.revision = revision
        self.excluded_revisions = set()
        self.plain_format = plain_format
        self.rewrite_tags = rewrite_tags
        self.baseline = baseline
        self._multi_author_api_available = hasattr(
            bzrlib.revision.Revision, 'get_apparent_authors')
        self.properties_to_exclude = ['authors', 'author']

        # Progress reporting stuff
        self.verbose = verbose
        if verbose:
            self.progress_every = 100
        else:
            self.progress_every = 1000
        self._start_time = time.time()
        self._commit_total = 0

        # Load the marks and initialise things accordingly
        self.revid_to_mark = {}
        self.branch_names = {}
        if self.import_marks_file:
            marks_info = marks_file.import_marks(self.import_marks_file)
            if marks_info is not None:
                # The marks file maps mark -> revision id; invert it.
                self.revid_to_mark = dict(
                    (r, m) for m, r in marks_info.items())
                # Branch names are no longer included in the marks file.

    def interesting_history(self):
        """Return the revision ids to export, oldest first.

        As a side effect, when a starting revision is given,
        self.excluded_revisions is set to the revisions reachable from it.
        """
        if self.revision:
            rev1, rev2 = builtins._get_revision_range(
                self.revision, self.branch, "fast-export")
            start_rev_id = rev1.rev_id
            end_rev_id = rev2.rev_id
        else:
            start_rev_id = None
            end_rev_id = None
        self.note("Calculating the revisions to include ...")
        view_revisions = [
            rev_id for rev_id, _, _, _ in
            self.branch.iter_merge_sorted_revisions(end_rev_id, start_rev_id)]
        view_revisions.reverse()
        # If a starting point was given, we need to later check that we don't
        # start emitting revisions from before that point. Collect the
        # revisions to exclude now ...
        if start_rev_id is not None:
            self.note("Calculating the revisions to exclude ...")
            self.excluded_revisions = set(
                [rev_id for rev_id, _, _, _ in
                 self.branch.iter_merge_sorted_revisions(start_rev_id)])
            if self.baseline:
                # needed so the first relative commit knows its parent
                self.excluded_revisions.remove(start_rev_id)
                view_revisions.insert(0, start_rev_id)
        return list(view_revisions)

    def run(self):
        """Export the branch: features, baseline, commits, tags, marks."""
        # Export the data while holding a read lock on the repository.
        self.branch.repository.lock_read()
        try:
            interesting = self.interesting_history()
            self._commit_total = len(interesting)
            self.note("Starting export of %d revisions ..." %
                      self._commit_total)
            if not self.plain_format:
                self.emit_features()
            if self.baseline:
                self.emit_baseline(interesting.pop(0), self.git_branch)
            for revid in interesting:
                self.emit_commit(revid, self.git_branch)
            if self.branch.supports_tags():
                self.emit_tags()
        finally:
            self.branch.repository.unlock()

        # Save the marks if requested
        self._save_marks()
        self.dump_stats()

    def note(self, msg, *args):
        """Output a note but timestamp it."""
        msg = "%s %s" % (self._time_of_day(), msg)
        trace.note(msg, *args)

    def warning(self, msg, *args):
        """Output a warning but timestamp it."""
        msg = "%s WARNING: %s" % (self._time_of_day(), msg)
        trace.warning(msg, *args)

    def _time_of_day(self):
        """Time of day as a string."""
        # Note: this is a separate method so tests can patch in a fixed value
        return time.strftime("%H:%M:%S")

    def report_progress(self, commit_count, details=''):
        """Emit a progress note every self.progress_every commits.

        :param commit_count: number of commits exported so far; nothing is
            reported when it is zero or not a multiple of progress_every.
        :param details: extra text appended to the note.
        """
        if not commit_count or commit_count % self.progress_every != 0:
            return
        if self._commit_total:
            counts = "%d/%d" % (commit_count, self._commit_total)
        else:
            counts = "%d" % (commit_count,)
        elapsed_minutes = (time.time() - self._start_time) / 60
        if elapsed_minutes > 0:
            rate = commit_count * 1.0 / elapsed_minutes
            if rate > 10:
                rate_str = "at %.0f/minute " % rate
            else:
                rate_str = "at %.1f/minute " % rate
        else:
            # No measurable time has elapsed, so a rate cannot be computed
            # (and dividing would raise ZeroDivisionError).
            rate_str = ''
        self.note("%s commits exported %s%s" % (counts, rate_str, details))

    def dump_stats(self):
        """Note how many marks are recorded and how long the export took."""
        time_required = progress.str_tdelta(time.time() - self._start_time)
        rc = len(self.revid_to_mark)
        self.note("Exported %d %s in %s",
                  rc, single_plural(rc, "revision", "revisions"),
                  time_required)

    def print_cmd(self, cmd):
        """Write the repr of cmd, followed by a newline, to the output."""
        self.outf.write("%r\n" % cmd)

    def _save_marks(self):
        """Write the mark -> revision id map if a marks file was requested."""
        if self.export_marks_file:
            revision_ids = dict(
                (m, r) for r, m in self.revid_to_mark.items())
            marks_file.export_marks(self.export_marks_file, revision_ids)

    def is_empty_dir(self, tree, path):
        """Return True if path is a directory with no entries in tree.

        Returns False (after a warning) when path has no file id in the
        tree, and False when path is not a directory.
        """
        path_id = tree.path2id(path)
        if path_id is None:
            self.warning("Skipping empty_dir detection - no file_id for %s" %
                         (path,))
            return False

        # Continue if path is not a directory
        if tree.kind(path_id) != 'directory':
            return False

        # Only the first block yielded by walkdirs is inspected (presumably
        # the one describing `path` itself -- confirm against bzrlib), so
        # stop there instead of walking the whole subtree.
        for block in tree.walkdirs(prefix=path):
            return len(block[1]) == 0
        return False

    def emit_features(self):
        """Emit a feature command for every known feature name, sorted."""
        for feature in sorted(commands.FEATURE_NAMES):
            self.print_cmd(commands.FeatureCommand(feature))

    def emit_baseline(self, revid, git_branch):
        """Emit a full source tree of the first commit's parent."""
        git_ref = 'refs/heads/%s' % (git_branch,)
        revobj = self.branch.repository.get_revision(revid)
        # NOTE(review): the baseline always takes mark 1; this presumably
        # assumes no conflicting mark was imported -- confirm.
        mark = 1
        self.revid_to_mark[revid] = mark
        # Diffing against the null revision yields the complete tree.
        file_cmds = self._get_filecommands(
            bzrlib.revision.NULL_REVISION, revid)
        self.print_cmd(
            self._get_commit_command(git_ref, mark, revobj, file_cmds))

    def emit_commit(self, revid, git_branch):
        """Emit the commit command for revid unless it is already handled.

        Revisions that already have a mark, or that are excluded, are
        skipped. Ghost revisions are recorded with the ghost mark and
        produce no output.
        """
        if revid in self.revid_to_mark or revid in self.excluded_revisions:
            return

        # Get the Revision object
        try:
            revobj = self.branch.repository.get_revision(revid)
        except bazErrors.NoSuchRevision:
            # This is a ghost revision. Mark it as not found and next!
            self.revid_to_mark[revid] = self._GHOST_MARK
            return

        # Get the primary parent
        # TODO: Consider the excluded revisions when deciding the parents.
        # Currently, a commit with parents that are excluded ought to be
        # triggering the git_branch calculation below (and it is not).
        # IGC 20090824
        ncommits = len(self.revid_to_mark)
        nparents = len(revobj.parent_ids)
        if nparents == 0:
            if ncommits:
                # This is a parentless commit but it's not the first one
                # output. We need to create a new temporary branch for it
                # otherwise git-fast-import will assume the previous commit
                # was this one's parent
                git_branch = self._next_tmp_branch_name()
            parent = bzrlib.revision.NULL_REVISION
        else:
            parent = revobj.parent_ids[0]

        # Print the commit
        git_ref = 'refs/heads/%s' % (git_branch,)
        mark = ncommits + 1
        self.revid_to_mark[revid] = mark
        file_cmds = self._get_filecommands(parent, revid)
        self.print_cmd(
            self._get_commit_command(git_ref, mark, revobj, file_cmds))

        # Report progress and checkpoint if it's time for that
        self.report_progress(ncommits)
        if (self.checkpoint > 0 and ncommits
                and ncommits % self.checkpoint == 0):
            self.note("Exported %i commits - adding checkpoint to output"
                      % ncommits)
            self._save_marks()
            self.print_cmd(commands.CheckpointCommand())

    def _get_name_email(self, user):
        """Split a bzr user string into UTF-8 encoded (name, email)."""
        if user.find('<') == -1:
            # If the email isn't inside <>, we need to use it as the name
            # in order for things to round-trip correctly.
            # (note: parseaddr('a@b.com') => name:'', email: 'a@b.com')
            name = user
            email = ''
        else:
            name, email = parseaddr(user)
        return name.encode("utf-8"), email.encode("utf-8")

    def _get_parent_marks(self, parent_ids):
        """Return ':<mark>' references for the parents that were exported.

        Excluded parents, parents without a mark and ghost parents
        (recorded with the ghost mark) are left out.
        """
        parent_marks = []
        for p in parent_ids:
            if p in self.excluded_revisions:
                continue
            parent_mark = self.revid_to_mark.get(p, self._GHOST_MARK)
            if parent_mark == self._GHOST_MARK:
                # ghost - ignore
                continue
            parent_marks.append(":%s" % parent_mark)
        return parent_marks

    def _get_commit_command(self, git_ref, mark, revobj, file_cmds):
        """Build the CommitCommand describing revobj.

        :param git_ref: ref the commit is emitted on.
        :param mark: mark assigned to the commit.
        :param revobj: the Revision object being exported.
        :param file_cmds: file commands making up the commit's changes.
        """
        # Get the committer and author info
        committer = revobj.committer
        name, email = self._get_name_email(committer)
        committer_info = (name, email, revobj.timestamp, revobj.timezone)
        if self._multi_author_api_available:
            more_authors = revobj.get_apparent_authors()
            author = more_authors.pop(0)
        else:
            more_authors = []
            author = revobj.get_apparent_author()
        if not self.plain_format and more_authors:
            name, email = self._get_name_email(author)
            author_info = (name, email, revobj.timestamp, revobj.timezone)
            more_author_info = []
            for a in more_authors:
                name, email = self._get_name_email(a)
                more_author_info.append(
                    (name, email, revobj.timestamp, revobj.timezone))
        elif author != committer:
            name, email = self._get_name_email(author)
            author_info = (name, email, revobj.timestamp, revobj.timezone)
            more_author_info = None
        else:
            author_info = None
            more_author_info = None

        # Get the parents in terms of marks
        parent_marks = self._get_parent_marks(revobj.parent_ids)
        if parent_marks:
            from_ = parent_marks[0]
            merges = parent_marks[1:]
        else:
            from_ = None
            merges = None

        # Filter the revision properties. Some metadata (like the
        # author information) is already exposed in other ways so
        # don't repeat it here. Work on a copy so that the revision
        # object itself is left untouched.
        if self.plain_format:
            properties = None
        else:
            properties = dict(revobj.properties)
            for prop in self.properties_to_exclude:
                properties.pop(prop, None)

        # Build and return the result
        return commands.CommitCommand(
            git_ref, mark, author_info, committer_info,
            revobj.message.encode("utf-8"), from_, merges, iter(file_cmds),
            more_authors=more_author_info, properties=properties)

    def _get_revision_trees(self, parent, revision_id):
        """Return (tree_old, tree_new) for parent and revision_id.

        tree_new is None when the revision's inventory is malformed.
        """
        try:
            tree_old = self.branch.repository.revision_tree(parent)
        except bazErrors.UnexpectedInventoryFormat:
            self.warning(
                "Parent is malformed - diffing against previous parent")
            # We can't find the old parent. Let's diff against its parent
            pp = self.branch.repository.get_revision(parent)
            tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
        tree_new = None
        try:
            tree_new = self.branch.repository.revision_tree(revision_id)
        except bazErrors.UnexpectedInventoryFormat:
            # We can't really do anything anymore
            self.warning("Revision %s is malformed - skipping" % revision_id)
        return tree_old, tree_new

    def _get_filecommands(self, parent, revision_id):
        """Get the list of FileCommands for the changes between two revisions."""
        tree_old, tree_new = self._get_revision_trees(parent, revision_id)
        if not (tree_old and tree_new):
            # Something is wrong with this revision - ignore the filecommands
            return []

        changes = tree_new.changes_from(tree_old)

        # Make "modified" have 3-tuples, as added does
        my_modified = [x[0:3] for x in changes.modified]

        # The potential interaction between renames and deletes is messy.
        # Handle it here ...
        file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
            changes.renamed, changes.removed, revision_id, tree_old)

        # Kind changes are exported as a modification using the new kind.
        # (IGC: a delete before the add was found to be unnecessary and
        # possibly harmful; if you need one, please file a bug explaining
        # why.)
        for path, id_, kind1, kind2 in changes.kind_changed:
            path = self._adjust_path_for_renames(path, renamed, revision_id)
            my_modified.append((path, id_, kind2))

        # Record modifications
        for path, id_, kind in changes.added + my_modified + rd_modifies:
            if kind == 'file':
                text = tree_new.get_file_text(id_)
                file_cmds.append(commands.FileModifyCommand(
                    path.encode("utf-8"),
                    helpers.kind_to_mode('file', tree_new.is_executable(id_)),
                    None, text))
            elif kind == 'symlink':
                file_cmds.append(commands.FileModifyCommand(
                    path.encode("utf-8"),
                    helpers.kind_to_mode('symlink', False),
                    None, tree_new.get_symlink_target(id_)))
            elif kind == 'directory':
                # Directories are only exported in the extended format.
                if not self.plain_format:
                    file_cmds.append(commands.FileModifyCommand(
                        path.encode("utf-8"),
                        helpers.kind_to_mode('directory', False),
                        None, None))
            else:
                self.warning("cannot export '%s' of kind %s yet - ignoring" %
                             (path, kind))
        return file_cmds

    def _process_renames_and_deletes(self, renames, deletes,
                                     revision_id, tree_old):
        """Order rename and delete commands so they replay correctly.

        :return: a tuple (file_cmds, modifies, renamed) where file_cmds is
            the list of delete/rename commands, modifies lists the
            (path, id, kind) of renamed entries that also changed, and
            renamed is a list of [oldpath, newpath] pairs.
        """
        file_cmds = []
        modifies = []
        renamed = []

        # See https://bugs.edge.launchpad.net/bzr-fastimport/+bug/268933.
        # In a nutshell, there are several nasty cases:
        #
        # 1) bzr rm a; bzr mv b a; bzr commit
        # 2) bzr mv x/y z; bzr rm x; commit
        #
        # The first must come out with the delete first like this:
        #
        # D a
        # R b a
        #
        # The second case must come out with the rename first like this:
        #
        # R x/y z
        # D x
        #
        # So outputting all deletes first or all renames first won't work.
        # Instead, we need to make multiple passes over the various lists to
        # get the ordering right.

        must_be_renamed = {}
        old_to_new = {}
        deleted_paths = set([p for p, _, _ in deletes])
        for (oldpath, newpath, id_, kind,
                text_modified, meta_modified) in renames:
            # Directory commands are only emitted in the extended format.
            emit = kind != 'directory' or not self.plain_format
            if newpath in deleted_paths:
                if emit:
                    file_cmds.append(commands.FileDeleteCommand(
                        newpath.encode("utf-8")))
                deleted_paths.remove(newpath)
            if self.is_empty_dir(tree_old, oldpath):
                self.note("Skipping empty dir %s in rev %s" % (oldpath,
                          revision_id))
                continue
            renamed.append([oldpath, newpath])
            old_to_new[oldpath] = newpath
            if emit:
                file_cmds.append(commands.FileRenameCommand(
                    oldpath.encode("utf-8"), newpath.encode("utf-8")))
            if text_modified or meta_modified:
                modifies.append((newpath, id_, kind))

            # Renaming a directory implies all children must be renamed.
            # Note: changes_from() doesn't handle this
            if kind == 'directory' and tree_old.kind(id_) == 'directory':
                for p, e in tree_old.inventory.iter_entries_by_dir(
                        from_dir=id_):
                    if e.kind == 'directory' and self.plain_format:
                        continue
                    old_child_path = osutils.pathjoin(oldpath, p)
                    new_child_path = osutils.pathjoin(newpath, p)
                    must_be_renamed[old_child_path] = new_child_path

        # Add children not already renamed
        if must_be_renamed:
            renamed_already = set(old_to_new.keys())
            still_to_be_renamed = set(must_be_renamed.keys()) - renamed_already
            for old_child_path in sorted(still_to_be_renamed):
                new_child_path = must_be_renamed[old_child_path]
                if self.verbose:
                    self.note("implicitly renaming %s => %s" % (
                        old_child_path, new_child_path))
                file_cmds.append(commands.FileRenameCommand(
                    old_child_path.encode("utf-8"),
                    new_child_path.encode("utf-8")))

        # Record remaining deletes
        for path, id_, kind in deletes:
            if path not in deleted_paths:
                continue
            if kind == 'directory' and self.plain_format:
                continue
            file_cmds.append(commands.FileDeleteCommand(path.encode("utf-8")))
        return file_cmds, modifies, renamed

    def _adjust_path_for_renames(self, path, renamed, revision_id):
        """Return path rewritten according to the renames seen so far.

        :param path: the path to adjust.
        :param renamed: sequence of (old, new) pairs, applied in order.
        :param revision_id: only used in the notes that are output.
        """
        # If a previous rename is found, we should adjust the path
        for old, new in renamed:
            if path == old:
                self.note("Changing path %s given rename to %s in revision %s"
                          % (path, new, revision_id))
                path = new
            elif path.startswith(old + '/'):
                self.note(
                    "Adjusting path %s given rename of %s to %s in revision %s"
                    % (path, old, new, revision_id))
                # Swap only the leading directory; a plain str.replace
                # would also rewrite later occurrences of "old/".
                path = new + path[len(old):]
        return path

    def emit_tags(self):
        """Emit a reset command for each tag that points at a marked commit.

        Tags pointing at revisions without a mark (or at ghosts) are
        skipped with a warning. In plain format, tag names git would
        reject are either rewritten (rewrite_tags) or skipped.
        """
        for tag, revid in self.branch.tags.get_tag_dict().items():
            mark = self.revid_to_mark.get(revid, self._GHOST_MARK)
            if mark == self._GHOST_MARK:
                self.warning('not creating tag %r pointing to non-existent '
                             'revision %s' % (tag, revid))
                continue
            git_ref = 'refs/tags/%s' % tag.encode("utf-8")
            if self.plain_format and not check_ref_format(git_ref):
                if self.rewrite_tags:
                    new_ref = sanitize_ref_name_for_git(git_ref)
                    self.warning(
                        'tag %r is exported as %r to be valid in git.',
                        git_ref, new_ref)
                    git_ref = new_ref
                else:
                    self.warning(
                        'not creating tag %r as its name would not be '
                        'valid in git.', git_ref)
                    continue
            self.print_cmd(commands.ResetCommand(git_ref, ":" + str(mark)))

    def _next_tmp_branch_name(self):
        """Return a unique branch name. The name will start with "tmp"."""
        prefix = 'tmp'
        if prefix not in self.branch_names:
            self.branch_names[prefix] = 0
        else:
            self.branch_names[prefix] += 1
        prefix = '%s.%d' % (prefix, self.branch_names[prefix])
        return prefix