Move pure-fastimport code into its own directory, in preparation of splitting it...
[bzr-fastimport.git] / fastimport / parser.py
blobab6efb62472367d98f747a5811c924ba931b7d23
1 # Copyright (C) 2008 Canonical Ltd
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 """Parser of import data into command objects.
19 In order to reuse existing front-ends, the stream format is a subset of
20 the one used by git-fast-import (as of the 1.5.4 release of git at least).
21 The grammar is:
23 stream ::= cmd*;
25 cmd ::= new_blob
26 | new_commit
27 | new_tag
28 | reset_branch
29 | checkpoint
30 | progress
33 new_blob ::= 'blob' lf
34 mark?
35 file_content;
36 file_content ::= data;
38 new_commit ::= 'commit' sp ref_str lf
39 mark?
40 ('author' sp name '<' email '>' when lf)?
41 'committer' sp name '<' email '>' when lf
42 commit_msg
43 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
44 ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
45 file_change*
46 lf?;
47 commit_msg ::= data;
49 file_change ::= file_clr
50 | file_del
51 | file_rnm
52 | file_cpy
53 | file_obm
54 | file_inm;
55 file_clr ::= 'deleteall' lf;
56 file_del ::= 'D' sp path_str lf;
57 file_rnm ::= 'R' sp path_str sp path_str lf;
58 file_cpy ::= 'C' sp path_str sp path_str lf;
59 file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
60 file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
61 data;
63 new_tag ::= 'tag' sp tag_str lf
64 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
65 'tagger' sp name '<' email '>' when lf
66 tag_msg;
67 tag_msg ::= data;
69 reset_branch ::= 'reset' sp ref_str lf
70 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
71 lf?;
73 checkpoint ::= 'checkpoint' lf
74 lf?;
76 progress ::= 'progress' sp not_lf* lf
77 lf?;
79 # note: the first idnum in a stream should be 1 and subsequent
80 # idnums should not have gaps between values as this will cause
81 # the stream parser to reserve space for the gapped values. An
82 # idnum can be updated in the future to a new object by issuing
83 # a new mark directive with the old idnum.
85 mark ::= 'mark' sp idnum lf;
86 data ::= (delimited_data | exact_data)
87 lf?;
89 # note: delim may be any string but must not contain lf.
90 # data_line may contain any data but must not be exactly
91 # delim. The lf after the final data_line is included in
92 # the data.
93 delimited_data ::= 'data' sp '<<' delim lf
94 (data_line lf)*
95 delim lf;
97 # note: declen indicates the length of binary_data in bytes.
98 # declen does not include the lf preceding the binary data.
100 exact_data ::= 'data' sp declen lf
101 binary_data;
103 # note: quoted strings are C-style quoting supporting \c for
104 # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
105 # is the signed byte value in octal. Note that the only
106 # characters which must actually be escaped to protect the
107 # stream formatting are: \, " and LF. Otherwise these values
108 # are UTF8.
110 ref_str ::= ref;
111 sha1exp_str ::= sha1exp;
112 tag_str ::= tag;
113 path_str ::= path | '"' quoted(path) '"' ;
114 mode ::= '100644' | '644'
115 | '100755' | '755'
116 | '120000'
119 declen ::= # unsigned 32 bit value, ascii base10 notation;
120 bigint ::= # unsigned integer value, ascii base10 notation;
121 binary_data ::= # file content, not interpreted;
123 when ::= raw_when | rfc2822_when;
124 raw_when ::= ts sp tz;
125 rfc2822_when ::= # Valid RFC 2822 date and time;
127 sp ::= # ASCII space character;
128 lf ::= # ASCII newline (LF) character;
130 # note: a colon (':') must precede the numerical value assigned to
131 # an idnum. This is to distinguish it from a ref or tag name as
132 # GIT does not permit ':' in ref or tag strings.
134 idnum ::= ':' bigint;
135 path ::= # GIT style file path, e.g. "a/b/c";
136 ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
137 tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
138 sha1exp ::= # Any valid GIT SHA1 expression;
139 hexsha1 ::= # SHA1 in hexadecimal format;
141 # note: name and email are UTF8 strings, however name must not
142 # contain '<' or lf and email must not contain any of the
143 # following: '<', '>', lf.
145 name ::= # valid GIT author/committer name;
146 email ::= # valid GIT author/committer email;
147 ts ::= # time since the epoch in seconds, ascii base10 notation;
148 tz ::= # GIT style timezone;
150 # note: comments may appear anywhere in the input, except
151 # within a data command. Any form of the data command
152 # always escapes the related input from comment processing.
154 # In case it is not clear, the '#' that starts the comment
155 # must be the first character on that line (an lf must have
156 # preceded it).
158 comment ::= '#' not_lf* lf;
159 not_lf ::= # Any byte that is not ASCII newline (LF);
163 import re
164 import sys
166 import commands
167 import dates
168 import errors
171 ## Stream parsing ##
class LineBasedParser(object):
    """A reader over a file-like object that tracks the current line number.

    Lines can be pushed back with push_line(); pushed-back lines are returned
    by readline()/next_line() before anything else is read from the input.
    read_bytes() and read_until() bypass that push-back buffer.
    """

    def __init__(self, input):
        """A Parser that keeps track of line numbers.

        :param input: the file-like object to read from
        """
        self.input = input
        self.lineno = 0
        # Lines pushed back onto the input stream (most recent last)
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called with
          the current line number followed by args
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF.

        Only a trailing newline is removed, so a final line that is not
        newline-terminated is returned intact.
        """
        line = self.readline()
        if not line:
            return None
        if line.endswith("\n"):
            return line[:-1]
        return line

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + "\n")

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        self.lineno += result.count("\n")
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter, without its trailing newline
        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + '\n'
        while True:
            line = self.input.readline()
            if not line:
                # EOF before the terminator: without this check the loop
                # would spin forever appending empty strings.
                self.abort(errors.MissingTerminator, terminator)
            # Keep line numbers accurate, as read_bytes() does.
            self.lineno += 1
            # The second comparison accepts a terminator on the very last
            # line of a stream that lacks a final newline.
            if line == term or line == terminator:
                break
            lines.append(line)
        return ''.join(lines)
# Patterns for "name <email> when" and "name <email>" user sections.
# The spec states that the name part should be non-empty, but
# git-fast-export does not always honour that, so the name group is
# [^<]* (zero or more characters) rather than one or more. The
# git-fast-import code also says the space before the email is optional,
# which is why no separator is required ahead of '<'.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
class ImportParser(LineBasedParser):
    """Parser that turns a fast-import stream into command objects."""

    def __init__(self, input, verbose=False, output=sys.stdout,
                 user_mapper=None):
        """A Parser of import commands.

        :param input: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        """
        LineBasedParser.__init__(self, input)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None

    def warning(self, msg):
        """Write a warning, prefixed with the current line number, to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Blank lines and comment lines are skipped. An unrecognised line
        raises InvalidCommand.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif not line or line.startswith('#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith('commit '):
                yield self._parse_commit(line[len('commit '):])
            elif line.startswith('blob'):
                yield self._parse_blob()
            elif line.startswith('progress '):
                yield commands.ProgressCommand(line[len('progress '):])
            elif line.startswith('reset '):
                yield self._parse_reset(line[len('reset '):])
            elif line.startswith('tag '):
                yield self._parse_tag(line[len('tag '):])
            elif line.startswith('checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith('feature'):
                yield self._parse_feature(line[len('feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif not line or line.startswith('#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith('M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith('D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith('R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith('C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith('deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data('blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info('commit', 'author', False)
        # Any further author lines are collected separately.
        more_authors = list(iter(
            lambda: self._get_user_info('commit', 'author', False), None))
        committer = self._get_user_info('commit', 'committer')
        message = self._decode_utf8(
            self._get_data('commit', 'message'), 'commit message')
        from_ = self._get_from()
        merges = []
        for merge in iter(self._get_merge, None):
            # while the spec suggests it's illegal, git-fast-export
            # outputs multiple merges on the one line, e.g.
            # merge :x :y :z
            merges.extend(merge.split(" "))
        # Later properties with the same name replace earlier ones.
        properties = dict(iter(self._get_property, None))
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, self.iter_file_commands, lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command.

        :param info: text in the form "name" or "name=value"
        """
        parts = info.split("=", 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(' ', 2)
        path = self._path(params[2])
        is_executable, kind = self._mode(params[0])
        if params[1] == 'inline':
            # Content follows as a data section rather than a reference.
            dataref = None
            data = self._get_data('filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, kind, is_executable, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        A message that is not valid UTF-8 is decoded with replacement
        characters (and a warning), matching the handling of commit messages.
        """
        from_ = self._get_from('tag')
        tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
        message = self._decode_utf8(
            self._get_data('tag', 'message'), 'tag message')
        return commands.TagCommand(name, from_, tagger, message)

    def _decode_utf8(self, raw, what):
        """Decode raw as UTF-8, warning and replacing bad bytes on failure.

        :param raw: the byte string to decode
        :param what: short description used in the warning message
        """
        try:
            return raw.decode('utf_8')
        except UnicodeDecodeError:
            self.warning(
                "%s not in utf8 - replacing unknown characters" % (what,))
            return raw.decode('utf_8', 'replace')

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id (text after 'mark :'), or None if the next line
          is not a mark section or the input is exhausted.
        """
        line = self.next_line()
        if line is None:
            # EOF: nothing to inspect and nothing to push back.
            return None
        if line.startswith('mark :'):
            return line[len('mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: if set, the name of the command that requires
          this section; MissingSection is raised when it is absent.
        :return: the text after 'from ', or None if the section is absent.
        """
        line = self.next_line()
        if line is not None and line.startswith('from '):
            return line[len('from '):]
        if required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        if line is not None:
            self.push_line(line)
        return None

    def _get_merge(self):
        """Parse a merge section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('merge '):
            return line[len('merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith('property '):
            return self._name_value(line[len('property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
                       accept_just_who=False):
        """Parse a user section.

        :param cmd: the command being parsed (used in error reports)
        :param section: the section keyword, e.g. 'author'
        :param required: if True, raise MissingSection when absent
        :param accept_just_who: passed through to _who_when()
        :return: the result of _who_when(), or None if the section is
          absent and not required.
        """
        line = self.next_line()
        prefix = section + ' '
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section,
                accept_just_who=accept_just_who)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            self.push_line(line)
        return None

    def _get_data(self, required_for, section='data'):
        """Parse a data section.

        Raises MissingSection if the next line is not a data header
        (including when the input is exhausted).

        :return: the data as a string
        """
        line = self.next_line()
        if line is None or not line.startswith('data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len('data '):]
        if rest.startswith('<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        read_bytes = self.read_bytes(size)
        # optional LF after data.
        following = self.input.readline()
        if following:
            self.lineno += 1
            if following != "\n":
                # Not the optional LF: hand the line back for normal
                # parsing, stripping only a real trailing newline.
                if following.endswith("\n"):
                    following = following[:-1]
                self.push_line(following)
        return read_bytes

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param accept_just_who: if True, a string lacking the date part is
          accepted and the 'now' date parser supplies the time.
        :return: a tuple of (name,email,timestamp,timezone). name may be
          the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(' ')) == 2:
                    date_format = 'raw'
                elif datestr == 'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                print("failed to parse datestr '%s'" % (datestr,))
                raise
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
            else:
                self.abort(errors.BadFormat, cmd, section, s)
        name = match.group(1)
        # Drop the single space separating the name from '<email>'.
        if name.endswith(" "):
            name = name[:-1]
        # The spec says names are *typically* utf8 encoded
        # but that isn't enforced by git-fast-export (at least).
        # Decode unconditionally so the result type does not depend on
        # whether the separator space was present.
        name = self._decode_utf8(name, "%s name" % (section,))
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        email = self._decode_utf8(match.group(2), "%s email" % (section,))
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)
        return (name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'.

        The value may span several lines; value-length says how many bytes
        it occupies. value is None when only a name is given.
        """
        parts = s.split(' ', 2)
        name = parts[0]
        if len(parts) == 1:
            return (name, None)
        size = int(parts[1])
        # Tolerate a missing value part (e.g. "name 0").
        value = parts[2] if len(parts) > 2 else ''
        still_to_read = size - len(value)
        if still_to_read > 0:
            # The newline ending the first line was already consumed and
            # belongs to the value; the last byte read here is the newline
            # that terminates the property.
            read_bytes = self.read_bytes(still_to_read)
            value += "\n" + read_bytes[:still_to_read - 1]
        value = value.decode('utf8')
        return (name, value)

    def _path(self, s):
        """Parse a path.

        A path starting with a double quote must also end with one and is
        unquoted as a C-style string; anything else is decoded as UTF-8
        when possible and returned unchanged otherwise.
        """
        if s.startswith('"'):
            # len check: a lone '"' is both the start and the end quote.
            if len(s) < 2 or s[-1] != '"':
                self.abort(errors.BadFormat, '?', '?', s)
            return _unquote_c_string(s[1:-1])
        try:
            return s.decode('utf_8')
        except UnicodeDecodeError:
            # The spec recommends utf8 encoding but that isn't enforced
            return s

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        :return: a list of the two unquoted paths
        """
        # TODO: handle a space in the first path
        if s.startswith('"'):
            parts = s[1:].split('" ', 1)
        else:
            parts = s.split(' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith('"') and parts[1].endswith('"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith('"') or parts[1].endswith('"'):
            # Unbalanced quoting on the second path.
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Parse a file mode into executable and kind.

        :return (is_executable, kind)
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in ('644', '100644', '0100644'):
            return False, commands.FILE_KIND
        elif s in ('755', '100755', '0100755'):
            return True, commands.FILE_KIND
        elif s in ('040000', '0040000'):
            return False, commands.DIRECTORY_KIND
        elif s in ('120000', '0120000'):
            return False, commands.SYMLINK_KIND
        elif s in ('160000', '0160000'):
            return False, commands.TREE_REFERENCE_KIND
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
def _unquote_c_string(s):
    r"""Return s with C-style escape sequences (\n, \", etc.) expanded."""
    # HACK: Python's own string escapes are close enough to C's, so the
    # built-in 'string_escape' codec does the work.
    unescaped = s.decode('string_escape', 'replace')
    return unescaped