git_remote_helpers: add fastimport library
[git/dscho.git] / git_remote_helpers / fastimport / parser.py
blobf9c2655913321a307cde939e98bf10d992d2d7a0
1 # Copyright (C) 2008 Canonical Ltd
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 import warnings
19 """Parser of import data into command objects.
21 In order to reuse existing front-ends, the stream format is a subset of
22 the one used by git-fast-import (as of the 1.5.4 release of git at least).
23 The grammar is:
25 stream ::= cmd*;
27 cmd ::= new_blob
28 | new_commit
29 | new_tag
30 | reset_branch
31 | checkpoint
32 | progress
35 new_blob ::= 'blob' lf
36 mark?
37 file_content;
38 file_content ::= data;
40 new_commit ::= 'commit' sp ref_str lf
41 mark?
42 ('author' sp name '<' email '>' when lf)?
43 'committer' sp name '<' email '>' when lf
44 commit_msg
45 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
46 ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
47 file_change*
48 lf?;
49 commit_msg ::= data;
51 file_change ::= file_clr
52 | file_del
53 | file_rnm
54 | file_cpy
55 | file_obm
56 | file_inm;
57 file_clr ::= 'deleteall' lf;
58 file_del ::= 'D' sp path_str lf;
59 file_rnm ::= 'R' sp path_str sp path_str lf;
60 file_cpy ::= 'C' sp path_str sp path_str lf;
61 file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
62 file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
63 data;
65 new_tag ::= 'tag' sp tag_str lf
66 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
67 'tagger' sp name '<' email '>' when lf
68 tag_msg;
69 tag_msg ::= data;
71 reset_branch ::= 'reset' sp ref_str lf
72 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
73 lf?;
75 checkpoint ::= 'checkpoint' lf
76 lf?;
78 progress ::= 'progress' sp not_lf* lf
79 lf?;
81 # note: the first idnum in a stream should be 1 and subsequent
82 # idnums should not have gaps between values as this will cause
83 # the stream parser to reserve space for the gapped values. An
84 # idnum can be updated in the future to a new object by issuing
85 # a new mark directive with the old idnum.
87 mark ::= 'mark' sp idnum lf;
88 data ::= (delimited_data | exact_data)
89 lf?;
91 # note: delim may be any string but must not contain lf.
92 # data_line may contain any data but must not be exactly
93 # delim. The lf after the final data_line is included in
94 # the data.
95 delimited_data ::= 'data' sp '<<' delim lf
96 (data_line lf)*
97 delim lf;
99 # note: declen indicates the length of binary_data in bytes.
100 # declen does not include the lf preceding the binary data.
102 exact_data ::= 'data' sp declen lf
103 binary_data;
105 # note: quoted strings are C-style quoting supporting \c for
106 # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
107 # is the signed byte value in octal. Note that the only
108 # characters which must actually be escaped to protect the
109 # stream formatting are: \, \" and LF. Otherwise these values
110 # are UTF8.
112 ref_str ::= ref;
113 sha1exp_str ::= sha1exp;
114 tag_str ::= tag;
115 path_str ::= path | '"' quoted(path) '"' ;
116 mode ::= '100644' | '644'
117 | '100755' | '755'
118 | '120000'
121 declen ::= # unsigned 32 bit value, ascii base10 notation;
122 bigint ::= # unsigned integer value, ascii base10 notation;
123 binary_data ::= # file content, not interpreted;
125 when ::= raw_when | rfc2822_when;
126 raw_when ::= ts sp tz;
127 rfc2822_when ::= # Valid RFC 2822 date and time;
129 sp ::= # ASCII space character;
130 lf ::= # ASCII newline (LF) character;
132 # note: a colon (':') must precede the numerical value assigned to
133 # an idnum. This is to distinguish it from a ref or tag name as
134 # GIT does not permit ':' in ref or tag strings.
136 idnum ::= ':' bigint;
137 path ::= # GIT style file path, e.g. \"a/b/c\";
138 ref ::= # GIT ref name, e.g. \"refs/heads/MOZ_GECKO_EXPERIMENT\";
139 tag ::= # GIT tag name, e.g. \"FIREFOX_1_5\";
140 sha1exp ::= # Any valid GIT SHA1 expression;
141 hexsha1 ::= # SHA1 in hexadecimal format;
143 # note: name and email are UTF8 strings, however name must not
144 # contain '<' or lf and email must not contain any of the
145 # following: '<', '>', lf.
147 name ::= # valid GIT author/committer name;
148 email ::= # valid GIT author/committer email;
149 ts ::= # time since the epoch in seconds, ascii base10 notation;
150 tz ::= # GIT style timezone;
152 # note: comments may appear anywhere in the input, except
153 # within a data command. Any form of the data command
154 # always escapes the related input from comment processing.
156 # In case it is not clear, the '#' that starts the comment
157 # must be the first character on the line (an lf must have
158 # preceded it).
160 comment ::= '#' not_lf* lf;
161 not_lf ::= # Any byte that is not ASCII newline (LF);
165 import re
166 import sys
168 from git_remote_helpers.fastimport import (
169 commands,
170 dates,
171 errors
175 ## Stream parsing ##
class LineBasedParser(object):
    """Line-oriented reader over a file-like object that tracks line numbers.

    Lines can be pushed back with push_line() and are then returned again
    by readline()/next_line(). read_bytes() and read_until() read from the
    underlying input directly and do not consult the push-back buffer.
    """

    def __init__(self, input, filename=None):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        :param filename: name passed to exceptions raised via abort(). When
            None, input.name is used, or "(unknown)" if input has no
            name attribute.
        """
        self.input = input
        if filename is None:
            filename = getattr(input, 'name', "(unknown)")
        self.filename = filename
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
            the filename, the current line number and then *args
        """
        raise exception(self.filename, self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF.

        Pushed-back lines are returned (most recently pushed first) before
        anything more is read from the input.
        """
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if not line:
            return None
        # Only strip a newline that is really there: the last line of a
        # stream may be unterminated and must not lose its final character.
        if line.endswith("\n"):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :param count: the number of bytes to read
        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        # Keep the line counter in step with the newlines just consumed.
        self.lineno += result.count("\n")
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the terminating line, without its newline
        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + '\n'
        while True:
            line = self.input.readline()
            if not line:
                # EOF: readline() would keep returning '' forever, so
                # report the missing terminator instead of looping.
                self.abort(errors.MissingTerminator, terminator)
            # Every line consumed here (terminator included) advances the
            # line counter, as read_bytes() does for the data it consumes.
            self.lineno += 1
            if line == term:
                break
            lines.append(line)
        return ''.join(lines)
# Regular expressions used for parsing 'who' and 'when' information.
# (Note: The spec states that the name part should be non-empty but
# git-fast-export doesn't always do that, so the first group is [^<]*,
# which also matches an empty name.) Also, the git-fast-import code says
# the space before the email is optional.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
class ImportParser(LineBasedParser):
    """Parser that turns a fast-import style stream into command objects."""

    def __init__(self, input, filename=None):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param filename: optional name for the input, passed through to
            LineBasedParser
        """
        LineBasedParser.__init__(self, input, filename)

        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def warning(self, msg):
        """Write msg to stderr, prefixed with the current line number."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def parse(self):
        """Parse the input stream, yielding a sequence of ImportCommand
        objects.  Iteration terminates on EOF or on a line starting with
        'done'.  Raises InvalidCommand on parse error."""
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                # blank lines and comments carry no command
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('done'):
                break
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith('feature'):
                yield self._parse_feature(line[len('feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_commands(self):
        """Deprecated alias for parse()."""
        warnings.warn("iter_commands() deprecated: use parse()",
            DeprecationWarning, stacklevel=2)
        return self.parse()

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _decode_message(self, message, kind):
        """Decode message from UTF-8, warning and replacing bad bytes.

        :param message: the raw message as read from the stream
        :param kind: the command name used in the warning text
        """
        try:
            return message.decode('utf_8')
        except UnicodeDecodeError:
            self.warning(
                "%s message not in utf8 - replacing unknown characters"
                % kind)
            return message.decode('utf_8', 'replace')

    def _parse_blob(self):
        """Parse a blob command."""
        location = (self.filename, self.lineno)
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, location)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        location = (self.filename, self.lineno)
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        # Any further 'author' lines are collected separately.
        more_authors = []
        while True:
            another_author = self._get_user_info('commit', 'author', False)
            if another_author is None:
                break
            more_authors.append(another_author)
        committer = self._get_user_info('commit', 'committer')
        message = self._decode_message(
            self._get_data('commit', 'message'), 'commit')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is None:
                break
            # while the spec suggests it's illegal, git-fast-export
            # outputs multiple merges on the one line, e.g.
            # merge :x :y :z
            merges.extend(merge.split(" "))
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is None:
                break
            name, value = name_value
            properties[name] = value
        file_cmds = list(self.iter_file_commands())
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, file_cmds, location,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command.

        :param info: the text after 'feature ', either "name" or
            "name=value"
        """
        parts = info.split("=", 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        location = (self.filename, self.lineno)
        return commands.FeatureCommand(name, value, location=location)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        if len(params) != 3:
            # A short line would otherwise surface as a bare IndexError
            # with no file/line information.
            self.abort(errors.BadFormat, 'filemodify', 'M', info)
        mode, dataref, raw_path = params
        path = self._path(raw_path)
        if dataref == 'inline':
            dataref = None
            data = self._get_data('filemodify')
        else:
            data = None
        return commands.FileModifyCommand(path, mode, dataref, data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command."""
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
        # Decode the same way commit messages are decoded, so a
        # non-UTF-8 tag message produces a warning rather than a crash.
        message = self._decode_message(
            self._get_data('tag', 'message'), 'tag')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the text after 'mark :', or None if the next line is not
            a mark line (in which case it is pushed back) or on EOF
        """
        line = self.next_line()
        if line is None:
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: when set, the name of the command that
            requires this section; MissingSection is raised if it is absent
        """
        line = self.next_line()
        if line is not None and line.startswith('from '):
            return line[len('from '):]
        if required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        if line is not None:
            self.push_line(line)
        return None

    def _get_merge(self):
        """Parse a merge section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('merge '):
            return line[len('merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('property '):
            return self._name_value(line[len('property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
        accept_just_who=False):
        """Parse a user section.

        :param cmd: the enclosing command name, used in error reports
        :param section: the section keyword, e.g. 'author'
        :param required: raise MissingSection when the section is absent
        :param accept_just_who: passed through to _who_when()
        :return: the result of _who_when(), or None if the section is
            absent and not required
        """
        line = self.next_line()
        prefix = section + ' '
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section,
                accept_just_who=accept_just_who)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        :param required_for: the enclosing command name, used in the
            MissingSection error
        :param section: the section name used in the MissingSection error
        :return: the data read, for either the delimited ('data <<delim')
            or the counted ('data <size>') form
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        read_bytes = self.read_bytes(size)
        # optional LF after data.
        following = self.input.readline()
        if following:
            self.lineno += 1
            if following != "\n":
                # Not the optional LF: hand the line back (push_line
                # re-adds the newline and undoes the lineno bump).
                if following.endswith("\n"):
                    following = following[:-1]
                self.push_line(following)
        return read_bytes

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param s: the text following the section keyword
        :param cmd: the enclosing command name, used in error reports
        :param section: the section name, used in error reports
        :param accept_just_who: when True, a string with no date is
            accepted and the 'now' date parser supplies the time
        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3)
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    date_format = 'raw'
                elif datestr == 'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            when = self.date_parser(datestr, self.lineno)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
            else:
                self.abort(errors.BadFormat, cmd, section, s)

        # Do not attempt to decode name or email address; they are just
        # bytes.  (Everything will work out better if they are in UTF-8,
        # but that's not guaranteed.)
        name = match.group(1).rstrip()
        email = match.group(2)
        return (name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'.

        If the value on the line is shorter than value-length, the
        remainder is read from the input, joined to the first part by a
        newline.
        """
        parts = s.split(' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read == 1:
                value += "\n"
            elif still_to_read > 0:
                read_bytes = self.read_bytes(still_to_read - 1)
                value += "\n" + read_bytes
            value = value.decode('utf8')
        return (name, value)

    def _path(self, s):
        """Parse a path.

        A path starting with a double quote must also end with one and is
        unquoted; any other path is returned unchanged.
        """
        if s.startswith('"'):
            # A lone '"' both starts and ends with a quote but is not a
            # quoted string.
            if len(s) < 2 or s[-1] != '"':
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])

        # Do *not* decode the path to a Unicode string: filenames on
        # Unix are just bytes.  Git and Mercurial, at least, inherit
        # this stance.  git-fast-import(1) merely says "It is
        # recommended that <path> always be encoded using UTF-8.", which
        # is good advice ... but not something we can count on here.
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        :return: a list of the two unquoted paths
        """
        # TODO: handle a space in the first path
        if s.startswith('"'):
            parts = s[1:].split('" ', 1)
        else:
            parts = s.split(' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith('"') and parts[1].endswith('"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith('"') or parts[1].endswith('"'):
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Parse a file mode into executable and symlink flags.

        :return (is_executable, is_symlink)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ['644', '100644', '0100644']:
            return False, False
        elif s in ['755', '100755', '0100755']:
            return True, False
        elif s in ['120000', '0120000']:
            return False, True
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
618 def _unquote_c_string(s):
619 """replace C-style escape sequences (\n, \", etc.) with real chars."""
620 # HACK: Python strings are close enough
621 return s.decode('string_escape', 'replace')