cvs2svn_lib/dumpfile_delegate.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
"""This module contains DumpfileDelegate, which writes Subversion dumpfiles."""
  18
  19
  20 try:
  21   from hashlib import md5
  22 except ImportError:
  23   from md5 import new as md5
  24
  25
  26 from cvs2svn_lib import config
  27 from cvs2svn_lib.common import FatalError
  28 from cvs2svn_lib.common import InternalError
  29 from cvs2svn_lib.common import path_split
  30 from cvs2svn_lib.context import Ctx
  31 from cvs2svn_lib.cvs_path import CVSDirectory
  32 from cvs2svn_lib.cvs_path import CVSFile
  33 from cvs2svn_lib.svn_repository_delegate import SVNRepositoryDelegate
  34 from cvs2svn_lib.apple_single_filter import get_maybe_apple_single_stream
  35
  36
# Things that can happen to a file.  These values are written verbatim
# as the "Node-action:" header of a file node in the dumpfile.
OP_ADD    = 'add'
OP_CHANGE = 'change'
  40
  41
  42 class DumpfileDelegate(SVNRepositoryDelegate):
  43   """Create a Subversion dumpfile."""
  44
  def __init__(self, revision_reader, dumpfile_path):
    """Create a DumpfileDelegate that writes to DUMPFILE_PATH.

    REVISION_READER is kept for later use; it supplies the content
    stream of each file revision that is written.  The file at
    DUMPFILE_PATH is opened for binary writing and the dumpfile header
    is emitted immediately.  Paths are later encoded using
    Ctx().cvs_filename_decoder()."""

    self._revision_reader = revision_reader
    self.dumpfile_path = dumpfile_path

    # The SVN paths of the basic project infrastructure directories
    # created so far: every LOD path plus all of their ancestor
    # directories.  The root directory ('') counts as present from the
    # start.
    self._basic_directories = set([''])

    self.dumpfile = open(dumpfile_path, 'wb')
    self._write_dumpfile_header(self.dumpfile)
  61
  62   def _write_dumpfile_header(self, dumpfile):
  63     # Initialize the dumpfile with the standard headers.
  64     #
  65     # Since the CVS repository doesn't have a UUID, and the Subversion
  66     # repository will be created with one anyway, we don't specify a
  67     # UUID in the dumpflie
  68     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
  69
  def _utf8_path(self, path):
    """Return a copy of PATH encoded in UTF-8.

    Raise FatalError if any component of PATH cannot be converted."""

    try:
      # Each '/'-separated component is converted on its own, because
      # the components may each use a different encoding.
      encoded_pieces = []
      for piece in path.split('/'):
        decoded_piece = Ctx().cvs_filename_decoder(piece)
        encoded_pieces.append(decoded_piece.encode('utf8'))
      return '/'.join(encoded_pieces)
    except UnicodeError:
      raise FatalError(
          "Unable to convert a path '%s' to internal encoding.\n"
          "Consider rerunning with one or more '--encoding' parameters or\n"
          "with '--fallback-encoding'."
          % (path,))
  86
  87   def _string_for_prop(self, name, value):
  88     """Return a property in the form needed for the dumpfile."""
  89
  90     return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)
  91
  92   def start_commit(self, revnum, revprops):
  93     """Emit the start of SVN_COMMIT (an SVNCommit)."""
  94
  95     self.revision = revnum
  96
  97     # The start of a new commit typically looks like this:
  98     #
  99     #   Revision-number: 1
 100     #   Prop-content-length: 129
 101     #   Content-length: 129
 102     #
 103     #   K 7
 104     #   svn:log
 105     #   V 27
 106     #   Log message for revision 1.
 107     #   K 10
 108     #   svn:author
 109     #   V 7
 110     #   jrandom
 111     #   K 8
 112     #   svn:date
 113     #   V 27
 114     #   2003-04-22T22:57:58.132837Z
 115     #   PROPS-END
 116     #
 117     # Notice that the length headers count everything -- not just the
 118     # length of the data but also the lengths of the lengths, including
 119     # the 'K ' or 'V ' prefixes.
 120     #
 121     # The reason there are both Prop-content-length and Content-length
 122     # is that the former includes just props, while the latter includes
 123     # everything.  That's the generic header form for any entity in a
 124     # dumpfile.  But since revisions only have props, the two lengths
 125     # are always the same for revisions.
 126
 127     # Calculate the output needed for the property definitions.
 128     prop_names = revprops.keys()
 129     prop_names.sort()
 130     prop_strings = []
 131     for propname in prop_names:
 132       if revprops[propname] is not None:
 133         prop_strings.append(
 134             self._string_for_prop(propname, revprops[propname]))
 135
 136     all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
 137     total_len = len(all_prop_strings)
 138
 139     # Print the revision header and revprops
 140     self.dumpfile.write(
 141         'Revision-number: %d\n'
 142         'Prop-content-length: %d\n'
 143         'Content-length: %d\n'
 144         '\n'
 145         '%s'
 146         '\n'
 147         % (self.revision, total_len, total_len, all_prop_strings)
 148         )
 149
 150   def end_commit(self):
 151     pass
 152
 153   def _make_any_dir(self, path):
 154     """Emit the creation of directory PATH."""
 155
 156     self.dumpfile.write(
 157         "Node-path: %s\n"
 158         "Node-kind: dir\n"
 159         "Node-action: add\n"
 160         "\n"
 161         "\n"
 162         % self._utf8_path(path)
 163         )
 164
 165   def _register_basic_directory(self, path, create):
 166     """Register the creation of PATH if it is not already there.
 167
 168     Create any parent directories that do not already exist.  If
 169     CREATE is set, also create PATH if it doesn't already exist.  This
 170     method should only be used for the LOD paths and the directories
 171     containing them, not for directories within an LOD path."""
 172
 173     if path not in self._basic_directories:
 174       # Make sure that the parent directory is present:
 175       self._register_basic_directory(path_split(path)[0], True)
 176       if create:
 177         self._make_any_dir(path)
 178       self._basic_directories.add(path)
 179
 180   def initialize_project(self, project):
 181     """Create any initial directories for the project.
 182
 183     The trunk, tags, and branches directories directories are created
 184     the first time the project is seen.  Be sure not to create parent
 185     directories that already exist (e.g., because two directories
 186     share part of their paths either within or across projects)."""
 187
 188     for path in project.get_initial_directories():
 189       self._register_basic_directory(path, True)
 190
 191   def initialize_lod(self, lod):
 192     lod_path = lod.get_path()
 193     if lod_path:
 194       self._register_basic_directory(lod_path, True)
 195
 196   def mkdir(self, lod, cvs_directory):
 197     self._make_any_dir(lod.get_path(cvs_directory.cvs_path))
 198
  def _add_or_change_path(self, cvs_rev, op):
    """Emit the addition or change corresponding to CVS_REV.

    OP is either the constant OP_ADD or OP_CHANGE.

    The content of CVS_REV is obtained from the revision reader and is
    read into memory as a whole before being written.  If the file is
    named '.cvsignore', its content is first written as the
    'svn:ignore' property of the containing directory; the file node
    itself is then written only if Ctx().keep_cvsignore is set.

    The dumpfile must support tell() and seek(): the text/content
    length and MD5 headers are written as fixed-width placeholders and
    overwritten once the content has been written."""

    assert op in [OP_ADD, OP_CHANGE]

    # The property handling here takes advantage of an undocumented
    # but IMHO consistent feature of the Subversion dumpfile-loading
    # code.  When a node's properties aren't mentioned (that is, the
    # "Prop-content-length:" header is absent, no properties are
    # listed at all, and there is no "PROPS-END\n" line) then no
    # change is made to the node's properties.
    #
    # This is consistent with the way dumpfiles behave w.r.t. text
    # content changes, so I'm comfortable relying on it.  If you
    # commit a change to *just* the properties of some node that
    # already has text contents from a previous revision, then in the
    # dumpfile output for the prop change, no "Text-content-length:"
    # nor "Text-content-md5:" header will be present, and the text of
    # the file will not be given.  But this does not cause the file's
    # text to be erased!  It simply remains unchanged.
    #
    # This works out great for cvs2svn, due to lucky coincidences:
    #
    # For files, we set most properties in the first revision and
    # never change them.  (The only exception is the 'cvs2svn:cvs-rev'
    # property.)  If 'cvs2svn:cvs-rev' is not being used, then there
    # is no need to remember the full set of properties on a given
    # file once we've set it.
    #
    # For directories, the only property we set is "svn:ignore", and
    # while we may change it after the first revision, we always do so
    # based on the contents of a ".cvsignore" file -- in other words,
    # CVS is doing the remembering for us, so we still don't have to
    # preserve the previous value of the property ourselves.

    # Calculate the (sorted-by-name) property string and length, if any.
    # When the properties are unchanged, both strings stay empty so
    # that no property section at all is written for the node.
    svn_props = cvs_rev.get_properties()
    if cvs_rev.properties_changed:
      prop_contents = ''
      prop_names = svn_props.keys()
      prop_names.sort()
      for pname in prop_names:
        pvalue = svn_props[pname]
        prop_contents += self._string_for_prop(pname, pvalue)
      prop_contents += 'PROPS-END\n'
      props_header = 'Prop-content-length: %d\n' % len(prop_contents)
    else:
      prop_contents = ''
      props_header = ''

    # If the file has keywords, we must prevent CVS/RCS from expanding
    # the keywords because they must be unexpanded in the repository,
    # or Subversion will get confused.
    has_keywords = bool(cvs_rev.get_properties().get('svn:keywords', None))
    stream = self._revision_reader.get_content_stream(
        cvs_rev, suppress_keyword_substitution=has_keywords
        )

    if Ctx().decode_apple_single:
      # Insert a filter to decode any files that are in AppleSingle
      # format:
      stream = get_maybe_apple_single_stream(stream)

    # If an svn:eol-style property is set, insert a filter that
    # normalizes all EOLs to the representation used for that style.

    eol_style = svn_props.get('svn:eol-style', None)
    if eol_style:
      stream = LF_EOL_Filter(stream, eol_style)

    # Content already read from the stream (only set for .cvsignore).
    buf = None

    # treat .cvsignore as a directory property
    dir_path, basename = path_split(cvs_rev.get_svn_path())
    if basename == '.cvsignore':
      buf = stream.read()
      ignore_vals = generate_ignores(buf)
      ignore_contents = '\n'.join(ignore_vals)
      if ignore_contents:
        ignore_contents += '\n'
      # 'K 10' is the length of the property name 'svn:ignore'.
      ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
                         (len(ignore_contents), ignore_contents))
      ignore_contents += 'PROPS-END\n'
      ignore_len = len(ignore_contents)

      # write headers, then props
      self.dumpfile.write(
          'Node-path: %s\n'
          'Node-kind: dir\n'
          'Node-action: change\n'
          'Prop-content-length: %d\n'
          'Content-length: %d\n'
          '\n'
          '%s'
          % (self._utf8_path(dir_path),
             ignore_len, ignore_len, ignore_contents)
          )
      if not Ctx().keep_cvsignore:
        # The .cvsignore file itself is not stored in the repository.
        stream.close()
        return

    self.dumpfile.write(
        'Node-path: %s\n'
        'Node-kind: file\n'
        'Node-action: %s\n'
        '%s'  # no property header if no props
        % (self._utf8_path(cvs_rev.get_svn_path()), op, props_header)
        )

    # Remember where the length/checksum headers start so that they
    # can be overwritten below.
    pos = self.dumpfile.tell()

    # The fields have fixed widths so that the placeholder written now
    # and the real values written later occupy exactly the same number
    # of bytes.
    content_header_fmt = (
        'Text-content-length: %16d\n'
        'Text-content-md5: %32s\n'
        'Content-length: %16d\n'
        '\n'
        )

    # Write placeholder values for now.
    self.dumpfile.write(content_header_fmt % (0, '', 0,))

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Insert the rev contents, calculating length and checksum.
    checksum = md5()
    if buf is None:
      buf = stream.read()
    else:
      # Part of the content was already consumed for svn:ignore above.
      buf = buf + stream.read()
    stream.close()

    checksum.update(buf)
    length = len(buf)
    self.dumpfile.write(buf)

    # Go back to overwrite the length and checksum headers with the
    # correct values.  The content length is the length of the property
    # data plus the text data, including any metadata inside them:
    self.dumpfile.seek(pos, 0)
    self.dumpfile.write(
        content_header_fmt
        % (length, checksum.hexdigest(), length + len(prop_contents),)
        )

    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability).
    self.dumpfile.write('\n\n')
 351
 352   def add_path(self, cvs_rev):
 353     """Emit the addition corresponding to CVS_REV, a CVSRevisionAdd."""
 354
 355     self._add_or_change_path(cvs_rev, OP_ADD)
 356
 357   def change_path(self, cvs_rev):
 358     """Emit the change corresponding to CVS_REV, a CVSRevisionChange."""
 359
 360     self._add_or_change_path(cvs_rev, OP_CHANGE)
 361
 362   def delete_lod(self, lod):
 363     """Emit the deletion of LOD."""
 364
 365     self.dumpfile.write(
 366         'Node-path: %s\n'
 367         'Node-action: delete\n'
 368         '\n'
 369         % (self._utf8_path(lod.get_path()),)
 370         )
 371     self._basic_directories.remove(lod.get_path())
 372
 373   def delete_path(self, lod, cvs_path):
 374     dir_path, basename = path_split(lod.get_path(cvs_path.get_cvs_path()))
 375     if basename == '.cvsignore':
 376       # When a .cvsignore file is deleted, the directory's svn:ignore
 377       # property needs to be deleted.
 378       ignore_contents = 'PROPS-END\n'
 379       ignore_len = len(ignore_contents)
 380
 381       # write headers, then props
 382       self.dumpfile.write(
 383           'Node-path: %s\n'
 384           'Node-kind: dir\n'
 385           'Node-action: change\n'
 386           'Prop-content-length: %d\n'
 387           'Content-length: %d\n'
 388           '\n'
 389           '%s'
 390           % (self._utf8_path(dir_path),
 391              ignore_len, ignore_len, ignore_contents)
 392           )
 393       if not Ctx().keep_cvsignore:
 394         return
 395
 396     self.dumpfile.write(
 397         'Node-path: %s\n'
 398         'Node-action: delete\n'
 399         '\n'
 400         % (self._utf8_path(lod.get_path(cvs_path.cvs_path)),)
 401         )
 402
 403   def copy_lod(self, src_lod, dest_lod, src_revnum):
 404     # Register the main LOD directory, and create parent directories
 405     # as needed:
 406     self._register_basic_directory(dest_lod.get_path(), False)
 407
 408     self.dumpfile.write(
 409         'Node-path: %s\n'
 410         'Node-kind: dir\n'
 411         'Node-action: add\n'
 412         'Node-copyfrom-rev: %d\n'
 413         'Node-copyfrom-path: %s\n'
 414         '\n'
 415         % (self._utf8_path(dest_lod.get_path()),
 416            src_revnum, self._utf8_path(src_lod.get_path()))
 417         )
 418
 419   def copy_path(self, cvs_path, src_lod, dest_lod, src_revnum):
 420     if isinstance(cvs_path, CVSFile):
 421       node_kind = 'file'
 422       if cvs_path.basename == '.cvsignore':
 423         # FIXME: Here we have to adjust the containing directory's
 424         # svn:ignore property to reflect the addition of the
 425         # .cvsignore file to the LOD!  This is awkward because we
 426         # don't have the contents of the .cvsignore file available.
 427         if not Ctx().keep_cvsignore:
 428           return
 429     elif isinstance(cvs_path, CVSDirectory):
 430       node_kind = 'dir'
 431     else:
 432       raise InternalError()
 433
 434     self.dumpfile.write(
 435         'Node-path: %s\n'
 436         'Node-kind: %s\n'
 437         'Node-action: add\n'
 438         'Node-copyfrom-rev: %d\n'
 439         'Node-copyfrom-path: %s\n'
 440         '\n'
 441         % (
 442             self._utf8_path(dest_lod.get_path(cvs_path.cvs_path)),
 443             node_kind,
 444             src_revnum,
 445             self._utf8_path(src_lod.get_path(cvs_path.cvs_path))
 446             )
 447         )
 448
 449   def finish(self):
 450     """Perform any cleanup necessary after all revisions have been
 451     committed."""
 452
 453     self.dumpfile.close()
 454
 455
 456 def generate_ignores(raw_ignore_val):
 457   ignore_vals = [ ]
 458   for ignore in raw_ignore_val.split():
 459     # Reset the list if we encounter a '!'
 460     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 461     if ignore == '!':
 462       ignore_vals = [ ]
 463     else:
 464       ignore_vals.append(ignore)
 465   return ignore_vals
 466
 467
 468 class LF_EOL_Filter:
 469   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 470   into the appropriate canonical eol style."""
 471
 472   eol_style_replacements = {
 473       'LF' : '\n',
 474       'CR' : '\r',
 475       'CRLF' : '\r\n',
 476       'native' : '\n',
 477       }
 478
 479   def __init__(self, stream, eol_style):
 480     self.stream = stream
 481     self.replacement = self.eol_style_replacements[eol_style]
 482     self.carry_cr = False
 483     self.eof = False
 484
 485   def read(self, size=-1):
 486     while True:
 487       buf = self.stream.read(size)
 488       self.eof = len(buf) == 0
 489       if self.carry_cr:
 490         buf = '\r' + buf
 491         self.carry_cr = False
 492       if not self.eof and buf[-1] == '\r':
 493         self.carry_cr = True
 494         buf = buf[:-1]
 495       buf = buf.replace('\r\n', '\n')
 496       buf = buf.replace('\r', '\n')
 497       if self.replacement != '\n':
 498         buf = buf.replace('\n', self.replacement)
 499       if buf or self.eof:
 500         return buf
 501
 502   def close(self):
 503     self.stream.close()
 504     self.stream = None
 505
 506