git_remote_helpers/fastimport/parser.py

   1 # Copyright (C) 2008 Canonical Ltd
   2 #
   3 # This program is free software; you can redistribute it and/or modify
   4 # it under the terms of the GNU General Public License as published by
   5 # the Free Software Foundation; either version 2 of the License, or
   6 # (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program; if not, write to the Free Software
  15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  16
  17 import warnings
  18
  19 """Parser of import data into command objects.
  20
  21 In order to reuse existing front-ends, the stream format is a subset of
  22 the one used by git-fast-import (as of the 1.5.4 release of git at least).
  23 The grammar is:
  24
  25   stream ::= cmd*;
  26
  27   cmd ::= new_blob
  28         | new_commit
  29         | new_tag
  30         | reset_branch
  31         | checkpoint
  32         | progress
  33         ;
  34
  35   new_blob ::= 'blob' lf
  36     mark?
  37     file_content;
  38   file_content ::= data;
  39
  40   new_commit ::= 'commit' sp ref_str lf
  41     mark?
  42     ('author' sp name '<' email '>' when lf)?
  43     'committer' sp name '<' email '>' when lf
  44     commit_msg
  45     ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
  46     ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
  47     file_change*
  48     lf?;
  49   commit_msg ::= data;
  50
  51   file_change ::= file_clr
  52     | file_del
  53     | file_rnm
  54     | file_cpy
  55     | file_obm
  56     | file_inm;
  57   file_clr ::= 'deleteall' lf;
  58   file_del ::= 'D' sp path_str lf;
  59   file_rnm ::= 'R' sp path_str sp path_str lf;
  60   file_cpy ::= 'C' sp path_str sp path_str lf;
  61   file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
  62   file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
  63     data;
  64
  65   new_tag ::= 'tag' sp tag_str lf
  66     'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
  67     'tagger' sp name '<' email '>' when lf
  68     tag_msg;
  69   tag_msg ::= data;
  70
  71   reset_branch ::= 'reset' sp ref_str lf
  72     ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
  73     lf?;
  74
  75   checkpoint ::= 'checkpoint' lf
  76     lf?;
  77
  78   progress ::= 'progress' sp not_lf* lf
  79     lf?;
  80
  81      # note: the first idnum in a stream should be 1 and subsequent
  82      # idnums should not have gaps between values as this will cause
  83      # the stream parser to reserve space for the gapped values.  An
  84      # idnum can be updated in the future to a new object by issuing
  85      # a new mark directive with the old idnum.
  86      #
  87   mark ::= 'mark' sp idnum lf;
  88   data ::= (delimited_data | exact_data)
  89     lf?;
  90
  91     # note: delim may be any string but must not contain lf.
  92     # data_line may contain any data but must not be exactly
  93     # delim. The lf after the final data_line is included in
  94     # the data.
  95   delimited_data ::= 'data' sp '<<' delim lf
  96     (data_line lf)*
  97     delim lf;
  98
  99      # note: declen indicates the length of binary_data in bytes.
 100      # declen does not include the lf preceding the binary data.
 101      #
 102   exact_data ::= 'data' sp declen lf
 103     binary_data;
 104
 105      # note: quoted strings are C-style quoting supporting \c for
 106      # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
 107      # is the signed byte value in octal.  Note that the only
 108      # characters which must actually be escaped to protect the
 109      # stream formatting is: \, \" and LF.  Otherwise these values
 110      # are UTF8.
 111      #
 112   ref_str     ::= ref;
 113   sha1exp_str ::= sha1exp;
 114   tag_str     ::= tag;
 115   path_str    ::= path    | '"' quoted(path)    '"' ;
 116   mode        ::= '100644' | '644'
 117                 | '100755' | '755'
 118                 | '120000'
 119                 ;
 120
 121   declen ::= # unsigned 32 bit value, ascii base10 notation;
 122   bigint ::= # unsigned integer value, ascii base10 notation;
 123   binary_data ::= # file content, not interpreted;
 124
 125   when         ::= raw_when | rfc2822_when;
 126   raw_when     ::= ts sp tz;
 127   rfc2822_when ::= # Valid RFC 2822 date and time;
 128
 129   sp ::= # ASCII space character;
 130   lf ::= # ASCII newline (LF) character;
 131
 132      # note: a colon (':') must precede the numerical value assigned to
 133      # an idnum.  This is to distinguish it from a ref or tag name as
 134      # GIT does not permit ':' in ref or tag strings.
 135      #
 136   idnum   ::= ':' bigint;
 137   path    ::= # GIT style file path, e.g. \"a/b/c\";
 138   ref     ::= # GIT ref name, e.g. \"refs/heads/MOZ_GECKO_EXPERIMENT\";
 139   tag     ::= # GIT tag name, e.g. \"FIREFOX_1_5\";
 140   sha1exp ::= # Any valid GIT SHA1 expression;
 141   hexsha1 ::= # SHA1 in hexadecimal format;
 142
 143      # note: name and email are UTF8 strings, however name must not
 144      # contain '<' or lf and email must not contain any of the
 145      # following: '<', '>', lf.
 146      #
 147   name  ::= # valid GIT author/committer name;
 148   email ::= # valid GIT author/committer email;
 149   ts    ::= # time since the epoch in seconds, ascii base10 notation;
 150   tz    ::= # GIT style timezone;
 151
 152      # note: comments may appear anywhere in the input, except
 153      # within a data command.  Any form of the data command
 154      # always escapes the related input from comment processing.
 155      #
 156      # In case it is not clear, the '#' that starts the comment
 157      # must be the first character on that line (an lf must have
 158      # preceded it).
 159      #
 160   comment ::= '#' not_lf* lf;
 161   not_lf  ::= # Any byte that is not ASCII newline (LF);
 162 """
 163
 164
 165 import re
 166 import sys
 167
 168 from git_remote_helpers.fastimport import (
 169     commands,
 170     dates,
 171     errors
 172     )
 173
 174
 175 ## Stream parsing ##
 176
 177 class LineBasedParser(object):
 178
 179     def __init__(self, input, filename=None):
 180         """A Parser that keeps track of line numbers.
 181
 182         :param input: the file-like object to read from
 183         """
 184         self.input = input
 185         if filename is None:
 186             try:
 187                 self.filename = input.name
 188             except AttributeError:
 189                 self.filename = "(unknown)"
 190         else:
 191             self.filename = filename
 192         self.lineno = 0
 193         # Lines pushed back onto the input stream
 194         self._buffer = []
 195
 196     def abort(self, exception, *args):
 197         """Raise an exception providing line number information."""
 198         raise exception(self.filename, self.lineno, *args)
 199
 200     def readline(self):
 201         """Get the next line including the newline or '' on EOF."""
 202         self.lineno += 1
 203         if self._buffer:
 204             return self._buffer.pop()
 205         else:
 206             return self.input.readline()
 207
 208     def next_line(self):
 209         """Get the next line without the newline or None on EOF."""
 210         line = self.readline()
 211         if line:
 212             return line[:-1]
 213         else:
 214             return None
 215
 216     def push_line(self, line):
 217         """Push line back onto the line buffer.
 218
 219         :param line: the line with no trailing newline
 220         """
 221         self.lineno -= 1
 222         self._buffer.append(line + "\n")
 223
 224     def read_bytes(self, count):
 225         """Read a given number of bytes from the input stream.
 226
 227         Throws MissingBytes if the bytes are not found.
 228
 229         Note: This method does not read from the line buffer.
 230
 231         :return: a string
 232         """
 233         result = self.input.read(count)
 234         found = len(result)
 235         self.lineno += result.count("\n")
 236         if found != count:
 237             self.abort(errors.MissingBytes, count, found)
 238         return result
 239
 240     def read_until(self, terminator):
 241         """Read the input stream until the terminator is found.
 242
 243         Throws MissingTerminator if the terminator is not found.
 244
 245         Note: This method does not read from the line buffer.
 246
 247         :return: the bytes read up to but excluding the terminator.
 248         """
 249
 250         lines = []
 251         term = terminator + '\n'
 252         while True:
 253             line = self.input.readline()
 254             if line == term:
 255                 break
 256             else:
 257                 lines.append(line)
 258         return ''.join(lines)
 259
 260
# Regular expressions used for parsing "who" and "when" information.
# (Note: The spec states that the name part should be non-empty but
# git-fast-export doesn't always do that, so the name group is [^<]*,
# which also matches an empty name.) Also git-fast-import code says the
# space before the email is optional, so none is required before '<'.
_WHO_AND_WHEN_RE = re.compile(r'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(r'([^<]*)<(.*)>')
 267
 268
 269 class ImportParser(LineBasedParser):
 270
 271     def __init__(self, input, filename=None):
 272         """A Parser of import commands.
 273
 274         :param input: the file-like object to read from
 275         :param verbose: display extra information of not
 276         """
 277         LineBasedParser.__init__(self, input, filename)
 278
 279         # We auto-detect the date format when a date is first encountered
 280         self.date_parser = None
 281
 282     def warning(self, msg):
 283         sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))
 284
 285     def parse(self):
 286         """Parse the input stream, yielding a sequence of ImportCommand
 287         objects.  Iteration terminates on EOF.  Raises InvalidCommand on
 288         parse error."""
 289         while True:
 290             line = self.next_line()
 291             if line is None:
 292                 break
 293             elif len(line) == 0 or line.startswith('#'):
 294                 continue
 295             # Search for commands in order of likelihood
 296             elif line.startswith('commit '):
 297                 yield self._parse_commit(line[len('commit '):])
 298             elif line.startswith('blob'):
 299                 yield self._parse_blob()
 300             elif line.startswith('done'):
 301                 break
 302             elif line.startswith('progress '):
 303                 yield commands.ProgressCommand(line[len('progress '):])
 304             elif line.startswith('reset '):
 305                 yield self._parse_reset(line[len('reset '):])
 306             elif line.startswith('tag '):
 307                 yield self._parse_tag(line[len('tag '):])
 308             elif line.startswith('checkpoint'):
 309                 yield commands.CheckpointCommand()
 310             elif line.startswith('feature'):
 311                 yield self._parse_feature(line[len('feature '):])
 312             else:
 313                 self.abort(errors.InvalidCommand, line)
 314
 315     def iter_commands(self):
 316         warnings.warn("iter_commands() deprecated: use parse()",
 317                       DeprecationWarning, stacklevel=2)
 318         return self.parse()
 319
 320     def iter_file_commands(self):
 321         """Iterator returning FileCommand objects.
 322
 323         If an invalid file command is found, the line is silently
 324         pushed back and iteration ends.
 325         """
 326         while True:
 327             line = self.next_line()
 328             if line is None:
 329                 break
 330             elif len(line) == 0 or line.startswith('#'):
 331                 continue
 332             # Search for file commands in order of likelihood
 333             elif line.startswith('M '):
 334                 yield self._parse_file_modify(line[2:])
 335             elif line.startswith('D '):
 336                 path = self._path(line[2:])
 337                 yield commands.FileDeleteCommand(path)
 338             elif line.startswith('R '):
 339                 old, new = self._path_pair(line[2:])
 340                 yield commands.FileRenameCommand(old, new)
 341             elif line.startswith('C '):
 342                 src, dest = self._path_pair(line[2:])
 343                 yield commands.FileCopyCommand(src, dest)
 344             elif line.startswith('deleteall'):
 345                 yield commands.FileDeleteAllCommand()
 346             else:
 347                 self.push_line(line)
 348                 break
 349
 350     def _parse_blob(self):
 351         """Parse a blob command."""
 352         location = (self.filename, self.lineno)
 353         mark = self._get_mark_if_any()
 354         data = self._get_data('blob')
 355         return commands.BlobCommand(mark, data, location)
 356
 357     def _parse_commit(self, ref):
 358         """Parse a commit command."""
 359         location = (self.filename, self.lineno)
 360         mark = self._get_mark_if_any()
 361         author = self._get_user_info('commit', 'author', False)
 362         more_authors = []
 363         while True:
 364             another_author = self._get_user_info('commit', 'author', False)
 365             if another_author is not None:
 366                 more_authors.append(another_author)
 367             else:
 368                 break
 369         committer = self._get_user_info('commit', 'committer')
 370         message = self._get_data('commit', 'message')
 371         try:
 372             message = message.decode('utf_8')
 373         except UnicodeDecodeError:
 374             self.warning(
 375                 "commit message not in utf8 - replacing unknown characters")
 376             message = message.decode('utf_8', 'replace')
 377         from_ = self._get_from()
 378         merges = []
 379         while True:
 380             merge = self._get_merge()
 381             if merge is not None:
 382                 # while the spec suggests it's illegal, git-fast-export
 383                 # outputs multiple merges on the one line, e.g.
 384                 # merge :x :y :z
 385                 these_merges = merge.split(" ")
 386                 merges.extend(these_merges)
 387             else:
 388                 break
 389         properties = {}
 390         while True:
 391             name_value = self._get_property()
 392             if name_value is not None:
 393                 name, value = name_value
 394                 properties[name] = value
 395             else:
 396                 break
 397         file_cmds = list(self.iter_file_commands())
 398         return commands.CommitCommand(ref, mark, author, committer, message,
 399             from_, merges, file_cmds, location,
 400             more_authors=more_authors, properties=properties)
 401
 402     def _parse_feature(self, info):
 403         """Parse a feature command."""
 404         parts = info.split("=", 1)
 405         name = parts[0]
 406         if len(parts) > 1:
 407             value = self._path(parts[1])
 408         else:
 409             value = None
 410         location = (self.filename, self.lineno)
 411         return commands.FeatureCommand(name, value, location=location)
 412
 413
 414     def _parse_file_modify(self, info):
 415         """Parse a filemodify command within a commit.
 416
 417         :param info: a string in the format "mode dataref path"
 418           (where dataref might be the hard-coded literal 'inline').
 419         """
 420         params = info.split(' ', 2)
 421         path = self._path(params[2])
 422         mode = params[0]
 423         if params[1] == 'inline':
 424             dataref = None
 425             data = self._get_data('filemodify')
 426         else:
 427             dataref = params[1]
 428             data = None
 429         return commands.FileModifyCommand(path, mode, dataref, data)
 430
 431     def _parse_reset(self, ref):
 432         """Parse a reset command."""
 433         from_ = self._get_from()
 434         return commands.ResetCommand(ref, from_)
 435
 436     def _parse_tag(self, name):
 437         """Parse a tag command."""
 438         from_ = self._get_from('tag')
 439         tagger = self._get_user_info('tag', 'tagger', accept_just_who=True)
 440         message = self._get_data('tag', 'message').decode('utf_8')
 441         return commands.TagCommand(name, from_, tagger, message)
 442
 443     def _get_mark_if_any(self):
 444         """Parse a mark section."""
 445         line = self.next_line()
 446         if line.startswith('mark :'):
 447             return line[len('mark :'):]
 448         else:
 449             self.push_line(line)
 450             return None
 451
 452     def _get_from(self, required_for=None):
 453         """Parse a from section."""
 454         line = self.next_line()
 455         if line is None:
 456             return None
 457         elif line.startswith('from '):
 458             return line[len('from '):]
 459         elif required_for:
 460             self.abort(errors.MissingSection, required_for, 'from')
 461         else:
 462             self.push_line(line)
 463             return None
 464
 465     def _get_merge(self):
 466         """Parse a merge section."""
 467         line = self.next_line()
 468         if line is None:
 469             return None
 470         elif line.startswith('merge '):
 471             return line[len('merge '):]
 472         else:
 473             self.push_line(line)
 474             return None
 475
 476     def _get_property(self):
 477         """Parse a property section."""
 478         line = self.next_line()
 479         if line is None:
 480             return None
 481         elif line.startswith('property '):
 482             return self._name_value(line[len('property '):])
 483         else:
 484             self.push_line(line)
 485             return None
 486
 487     def _get_user_info(self, cmd, section, required=True,
 488         accept_just_who=False):
 489         """Parse a user section."""
 490         line = self.next_line()
 491         if line.startswith(section + ' '):
 492             return self._who_when(line[len(section + ' '):], cmd, section,
 493                 accept_just_who=accept_just_who)
 494         elif required:
 495             self.abort(errors.MissingSection, cmd, section)
 496         else:
 497             self.push_line(line)
 498             return None
 499
 500     def _get_data(self, required_for, section='data'):
 501         """Parse a data section."""
 502         line = self.next_line()
 503         if line.startswith('data '):
 504             rest = line[len('data '):]
 505             if rest.startswith('<<'):
 506                 return self.read_until(rest[2:])
 507             else:
 508                 size = int(rest)
 509                 read_bytes = self.read_bytes(size)
 510                 # optional LF after data.
 511                 next = self.input.readline()
 512                 self.lineno += 1
 513                 if len(next) > 1 or next != "\n":
 514                     self.push_line(next[:-1])
 515                 return read_bytes
 516         else:
 517             self.abort(errors.MissingSection, required_for, section)
 518
 519     def _who_when(self, s, cmd, section, accept_just_who=False):
 520         """Parse who and when information from a string.
 521
 522         :return: a tuple of (name,email,timestamp,timezone). name may be
 523             the empty string if only an email address was given.
 524         """
 525         match = _WHO_AND_WHEN_RE.search(s)
 526         if match:
 527             datestr = match.group(3)
 528             if self.date_parser is None:
 529                 # auto-detect the date format
 530                 if len(datestr.split(' ')) == 2:
 531                     format = 'raw'
 532                 elif datestr == 'now':
 533                     format = 'now'
 534                 else:
 535                     format = 'rfc2822'
 536                 self.date_parser = dates.DATE_PARSERS_BY_NAME[format]
 537             when = self.date_parser(datestr, self.lineno)
 538         else:
 539             match = _WHO_RE.search(s)
 540             if accept_just_who and match:
 541                 # HACK around missing time
 542                 # TODO: output a warning here
 543                 when = dates.DATE_PARSERS_BY_NAME['now']('now')
 544             else:
 545                 self.abort(errors.BadFormat, cmd, section, s)
 546
 547         # Do not attempt to decode name or email address; they are just
 548         # bytes.  (Everything will work out better if they are in UTF-8,
 549         # but that's not guaranteed.)
 550         name = match.group(1).rstrip()
 551         email = match.group(2)
 552         return (name, email, when[0], when[1])
 553
 554     def _name_value(self, s):
 555         """Parse a (name,value) tuple from 'name value-length value'."""
 556         parts = s.split(' ', 2)
 557         name = parts[0]
 558         if len(parts) == 1:
 559             value = None
 560         else:
 561             size = int(parts[1])
 562             value = parts[2]
 563             still_to_read = size - len(value)
 564             if still_to_read == 1:
 565                 value += "\n"
 566             elif still_to_read > 0:
 567                 read_bytes = self.read_bytes(still_to_read - 1)
 568                 value += "\n" + read_bytes
 569             value = value.decode('utf8')
 570         return (name, value)
 571
 572     def _path(self, s):
 573         """Parse a path."""
 574         if s.startswith('"'):
 575             if s[-1] != '"':
 576                 self.abort(errors.BadFormat, '?', '?', s)
 577             else:
 578                 return _unquote_c_string(s[1:-1])
 579
 580         # Do *not* decode the path to a Unicode string: filenames on
 581         # Unix are just bytes.  Git and Mercurial, at least, inherit
 582         # this stance.  git-fast-import(1) merely says "It is
 583         # recommended that <path> always be encoded using UTF-8.", which
 584         # is good advice ... but not something we can count on here.
 585         return s
 586
 587     def _path_pair(self, s):
 588         """Parse two paths separated by a space."""
 589         # TODO: handle a space in the first path
 590         if s.startswith('"'):
 591             parts = s[1:].split('" ', 1)
 592         else:
 593             parts = s.split(' ', 1)
 594         if len(parts) != 2:
 595             self.abort(errors.BadFormat, '?', '?', s)
 596         elif parts[1].startswith('"') and parts[1].endswith('"'):
 597             parts[1] = parts[1][1:-1]
 598         elif parts[1].startswith('"') or parts[1].endswith('"'):
 599             self.abort(errors.BadFormat, '?', '?', s)
 600         return map(_unquote_c_string, parts)
 601
 602     def _mode(self, s):
 603         """Parse a file mode into executable and symlink flags.
 604
 605         :return (is_executable, is_symlink)
 606         """
 607         # Note: Output from git-fast-export slightly different to spec
 608         if s in ['644', '100644', '0100644']:
 609             return False, False
 610         elif s in ['755', '100755', '0100755']:
 611             return True, False
 612         elif s in ['120000', '0120000']:
 613             return False, True
 614         else:
 615             self.abort(errors.BadFormat, 'filemodify', 'mode', s)
 616
 617
 618 def _unquote_c_string(s):
 619     """replace C-style escape sequences (\n, \", etc.) with real chars."""
 620     # HACK: Python strings are close enough
 621     return s.decode('string_escape', 'replace')