cvs2svn_lib/dumpfile_delegate.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2006 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
"""This module contains DumpfileDelegate, which writes the converted
repository out as a Subversion dumpfile, together with its helpers."""
  18
  19
  20 import os
  21 import md5
  22
  23 from boolean import *
  24 import common
  25 from common import FatalError
  26 import config
  27 from context import Ctx
  28 from svn_repository_mirror import SVNRepositoryMirrorDelegate
  29
  30
  31 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
  32   """Create a Subversion dumpfile."""
  33
  def __init__(self, dumpfile_path=None):
    """Open the output dumpfile and write the standard header to it.

    The file written is DUMPFILE_PATH when that is given and non-empty,
    and Ctx().dumpfile otherwise.  The path actually used is stored in
    self.dumpfile_path and the open file object in self.dumpfile."""

    # Any false value (None, '') selects the configured default path.
    self.dumpfile_path = dumpfile_path or Ctx().dumpfile

    # The dumpfile is opened in binary mode ('wb').
    self.dumpfile = open(self.dumpfile_path, 'wb')
    self._write_dumpfile_header(self.dumpfile)
  45
  46   def _write_dumpfile_header(self, dumpfile):
  47     # Initialize the dumpfile with the standard headers.
  48     #
  49     # Since the CVS repository doesn't have a UUID, and the Subversion
  50     # repository will be created with one anyway, we don't specify a
  51     # UUID in the dumpflie
  52     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
  53
  def _utf8_path(self, path):
    """Return a copy of PATH with every component encoded in UTF-8.

    Raise FatalError if any component cannot be converted strictly."""

    # Each '/'-separated component is converted on its own, because
    # the components may each use a different encoding.
    converted = []
    for component in path.split('/'):
      try:
        # Log messages can be converted with the 'replace' strategy,
        # but no lossiness can be afforded for paths.
        converted.append(Ctx().to_utf8(component, 'strict'))
      except UnicodeError:
        raise FatalError(
            "Unable to convert a path '%s' to internal encoding.\n"
            "Consider rerunning with one or more '--encoding' parameters."
            % (path,))
    return '/'.join(converted)
  71
  72   def _string_for_prop(self, name, value):
  73     """Return a property in the form needed for the dumpfile."""
  74
  75     return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)
  76
  77   def start_commit(self, svn_commit):
  78     """Emit the start of SVN_COMMIT (an SVNCommit)."""
  79
  80     self.revision = svn_commit.revnum
  81
  82     # The start of a new commit typically looks like this:
  83     #
  84     #   Revision-number: 1
  85     #   Prop-content-length: 129
  86     #   Content-length: 129
  87     #
  88     #   K 7
  89     #   svn:log
  90     #   V 27
  91     #   Log message for revision 1.
  92     #   K 10
  93     #   svn:author
  94     #   V 7
  95     #   jrandom
  96     #   K 8
  97     #   svn:date
  98     #   V 27
  99     #   2003-04-22T22:57:58.132837Z
 100     #   PROPS-END
 101     #
 102     # Notice that the length headers count everything -- not just the
 103     # length of the data but also the lengths of the lengths, including
 104     # the 'K ' or 'V ' prefixes.
 105     #
 106     # The reason there are both Prop-content-length and Content-length
 107     # is that the former includes just props, while the latter includes
 108     # everything.  That's the generic header form for any entity in a
 109     # dumpfile.  But since revisions only have props, the two lengths
 110     # are always the same for revisions.
 111
 112     # Calculate the output needed for the property definitions.
 113     props = svn_commit.get_revprops()
 114     prop_names = props.keys()
 115     prop_names.sort()
 116     prop_strings = []
 117     for propname in prop_names:
 118       if props[propname] is not None:
 119         prop_strings.append(self._string_for_prop(propname, props[propname]))
 120
 121     all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
 122     total_len = len(all_prop_strings)
 123
 124     # Print the revision header and props
 125     self.dumpfile.write('Revision-number: %d\n'
 126                         'Prop-content-length: %d\n'
 127                         'Content-length: %d\n'
 128                         '\n'
 129                         % (self.revision, total_len, total_len))
 130
 131     self.dumpfile.write(all_prop_strings)
 132     self.dumpfile.write('\n')
 133
 134   def mkdir(self, path):
 135     """Emit the creation of directory PATH."""
 136
 137     self.dumpfile.write("Node-path: %s\n"
 138                         "Node-kind: dir\n"
 139                         "Node-action: add\n"
 140                         "\n"
 141                         "\n" % self._utf8_path(path))
 142
  def _add_or_change_path(self, s_item, op):
    """Emit the file node record that adds or changes S_ITEM.

    OP is either the constant common.OP_ADD or common.OP_CHANGE and
    selects the 'Node-action:' that is written.

    The following attributes of S_ITEM are read: c_rev,
    svn_props_changed, svn_props, has_keywords and needs_eol_filter.
    The file text is obtained from the pipe returned by
    Ctx().cvs_repository.get_co_pipe() and is streamed into the
    dumpfile; the length and checksum headers are written as
    placeholders first and patched afterwards.

    If the file is named '.cvsignore', a property change record that
    sets 'svn:ignore' on its directory is emitted before the file
    record; the file record itself is still written.

    Raise FatalError if OP is neither of the two constants, or if the
    checkout command exits with a non-zero status."""

    # Map OP onto the dumpfile action name, rejecting anything else.
    if op == common.OP_ADD:
      action = 'add'
    elif op == common.OP_CHANGE:
      action = 'change'
    else:
      raise FatalError("_add_or_change_path() called with bad op ('%s')"
                       % (op,))

    # Convenience variables
    c_rev = s_item.c_rev

    # The property handling here takes advantage of an undocumented
    # but IMHO consistent feature of the Subversion dumpfile-loading
    # code.  When a node's properties aren't mentioned (that is, the
    # "Prop-content-length:" header is absent, no properties are
    # listed at all, and there is no "PROPS-END\n" line) then no
    # change is made to the node's properties.
    #
    # This is consistent with the way dumpfiles behave w.r.t. text
    # content changes, so I'm comfortable relying on it.  If you
    # commit a change to *just* the properties of some node that
    # already has text contents from a previous revision, then in the
    # dumpfile output for the prop change, no "Text-content-length:"
    # nor "Text-content-md5:" header will be present, and the text of
    # the file will not be given.  But this does not cause the file's
    # text to be erased!  It simply remains unchanged.
    #
    # This works out great for cvs2svn, due to lucky coincidences:
    #
    # For files, the only properties we ever set are set in the first
    # revision; all other revisions (including on branches) inherit
    # from that.  After the first revision, we never change file
    # properties, therefore, there is no need to remember the full set
    # of properties on a given file once we've set it.
    #
    # For directories, the only property we set is "svn:ignore", and
    # while we may change it after the first revision, we always do so
    # based on the contents of a ".cvsignore" file -- in other words,
    # CVS is doing the remembering for us, so we still don't have to
    # preserve the previous value of the property ourselves.

    # Calculate the (sorted-by-name) property string and length, if any.
    # When the properties did not change, both strings stay empty so
    # that no property section at all is written for the node.
    if s_item.svn_props_changed:
      svn_props = s_item.svn_props
      prop_contents = ''
      prop_names = svn_props.keys()
      prop_names.sort()
      for pname in prop_names:
        pvalue = svn_props[pname]
        # Properties whose value is None are not written.
        if pvalue is not None:
          prop_contents += self._string_for_prop(pname, pvalue)
      prop_contents += 'PROPS-END\n'
      props_header = 'Prop-content-length: %d\n' % len(prop_contents)
    else:
      prop_contents = ''
      props_header = ''

    # Treat .cvsignore as a directory property: its patterns become the
    # "svn:ignore" property of the directory that contains the file.
    dir_path, basename = os.path.split(c_rev.svn_path)
    if basename == ".cvsignore":
      ignore_vals = generate_ignores(c_rev)
      ignore_contents = '\n'.join(ignore_vals)
      ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
                         (len(ignore_contents), ignore_contents))
      ignore_contents += 'PROPS-END\n'
      ignore_len = len(ignore_contents)

      # Write headers, then props.  The record carries only properties,
      # so the same length is used for both length headers.
      self.dumpfile.write('Node-path: %s\n'
                          'Node-kind: dir\n'
                          'Node-action: change\n'
                          'Prop-content-length: %d\n'
                          'Content-length: %d\n'
                          '\n'
                          '%s'
                          % (self._utf8_path(dir_path), ignore_len,
                             ignore_len, ignore_contents))

    # If the file has keywords, we must prevent CVS/RCS from expanding
    # the keywords because they must be unexpanded in the repository,
    # or Subversion will get confused.
    pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
        c_rev, suppress_keyword_substitution=s_item.has_keywords)

    # Write the node headers up to and including the label of the text
    # length; the length value itself is not known yet.
    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        '%s'  # no property header if no props
                        'Text-content-length: '
                        % (self._utf8_path(c_rev.svn_path),
                           action, props_header))

    # Remember where the text length value starts, so that it and the
    # two headers after it can be patched once the text has been read.
    pos = self.dumpfile.tell()

    # Fixed-width placeholders: 16 characters for each length and 32
    # for the MD5 hex digest.  The offsets used when patching below
    # depend on these exact widths.
    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Insert a filter to convert all EOLs to LFs if necessary
    if s_item.needs_eol_filter:
      data_reader = LF_EOL_Filter(pipe.stdout)
    else:
      data_reader = pipe.stdout

    # Insert the rev contents, calculating length and checksum as we go.
    # Both are computed on the data as written, i.e. after EOL filtering.
    checksum = md5.new()
    length = 0
    while True:
      buf = data_reader.read(config.PIPE_READ_SIZE)
      if buf == '':
        break
      checksum.update(buf)
      length += len(buf)
      self.dumpfile.write(buf)

    # Finish with the child process and fail if it reported an error.
    pipe.stdout.close()
    error_output = pipe.stderr.read()
    exit_status = pipe.wait()
    if exit_status:
      raise FatalError("The command '%s' failed with exit status: %s\n"
                       "and the following output:\n"
                       "%s" % (pipe_cmd, exit_status, error_output))

    # Go back to patch up the length and checksum headers:
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of the property data (including
    # its 'PROPS-END' line) plus the length of the text data.
    self.dumpfile.write('%16d' % (length + len(prop_contents)))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done.  Write two newlines: one to terminate
    # contents that were not themselves newline-terminated, and one to
    # provide a blank line for readability.
    self.dumpfile.write('\n\n')
 295
 296   def add_path(self, s_item):
 297     """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
 298
 299     self._add_or_change_path(s_item, common.OP_ADD)
 300
 301   def change_path(self, s_item):
 302     """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
 303
 304     self._add_or_change_path(s_item, common.OP_CHANGE)
 305
 306   def delete_path(self, path):
 307     """Emit the deletion of PATH."""
 308
 309     self.dumpfile.write('Node-path: %s\n'
 310                         'Node-action: delete\n'
 311                         '\n' % self._utf8_path(path))
 312
 313   def copy_path(self, src_path, dest_path, src_revnum):
 314     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
 315
 316     # We don't need to include "Node-kind:" for copies; the loader
 317     # ignores it anyway and just uses the source kind instead.
 318     self.dumpfile.write('Node-path: %s\n'
 319                         'Node-action: add\n'
 320                         'Node-copyfrom-rev: %d\n'
 321                         'Node-copyfrom-path: /%s\n'
 322                         '\n'
 323                         % (self._utf8_path(dest_path),
 324                            src_revnum,
 325                            self._utf8_path(src_path)))
 326
 327   def finish(self):
 328     """Perform any cleanup necessary after all revisions have been
 329     committed."""
 330
 331     self.dumpfile.close()
 332
 333
 334 def generate_ignores(c_rev):
 335   # Read in props
 336   pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
 337   buf = pipe.stdout.read(config.PIPE_READ_SIZE)
 338   raw_ignore_val = ""
 339   while buf:
 340     raw_ignore_val += buf
 341     buf = pipe.stdout.read(config.PIPE_READ_SIZE)
 342   pipe.stdout.close()
 343   error_output = pipe.stderr.read()
 344   exit_status = pipe.wait()
 345   if exit_status:
 346     raise FatalError("The command '%s' failed with exit status: %s\n"
 347                      "and the following output:\n"
 348                      "%s" % (pipe_cmd, exit_status, error_output))
 349
 350   # Tweak props: First, convert any spaces to newlines...
 351   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 352   raw_ignores = raw_ignore_val.split('\n')
 353   ignore_vals = [ ]
 354   for ignore in raw_ignores:
 355     # Reset the list if we encounter a '!'
 356     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 357     if ignore == '!':
 358       ignore_vals = [ ]
 359       continue
 360     # Skip empty lines
 361     if len(ignore) == 0:
 362       continue
 363     ignore_vals.append(ignore)
 364   return ignore_vals
 365
 366
 367 class LF_EOL_Filter:
 368   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 369   into LFs only."""
 370
 371   def __init__(self, stream):
 372     self.stream = stream
 373     self.carry_cr = False
 374     self.eof = False
 375
 376   def read(self, size):
 377     while True:
 378       buf = self.stream.read(size)
 379       self.eof = len(buf) == 0
 380       if self.carry_cr:
 381         buf = '\r' + buf
 382         self.carry_cr = False
 383       if not self.eof and buf[-1] == '\r':
 384         self.carry_cr = True
 385         buf = buf[:-1]
 386       buf = buf.replace('\r\n', '\n')
 387       buf = buf.replace('\r', '\n')
 388       if len(buf) > 0 or self.eof:
 389         return buf
 390
 391