Read the whole rev stream before converting EOL styles.
[cvs2svn.git] / cvs2svn_lib / dumpfile_delegate.py
blobd51094c1b1215ce2b166b72bfe8ead109480902d
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains the DumpfileDelegate class, which writes the converted repository out as a Subversion dumpfile."""
20 try:
21 from hashlib import md5
22 except ImportError:
23 from md5 import new as md5
26 from cvs2svn_lib import config
27 from cvs2svn_lib.common import FatalError
28 from cvs2svn_lib.common import InternalError
29 from cvs2svn_lib.common import path_split
30 from cvs2svn_lib.context import Ctx
31 from cvs2svn_lib.cvs_path import CVSDirectory
32 from cvs2svn_lib.cvs_path import CVSFile
33 from cvs2svn_lib.svn_repository_delegate import SVNRepositoryDelegate
34 from cvs2svn_lib.apple_single_filter import get_maybe_apple_single_stream
# Things that can happen to a file.
OP_ADD = 'add'
OP_CHANGE = 'change'


# A mapping from the value of the svn:eol-style property to the EOL
# string that should appear in a dumpfile:
EOL_STYLE_REPLACEMENTS = {
    'LF' : '\n',
    'CR' : '\r',
    'CRLF' : '\r\n',
    'native' : '\n',
    }
def canonicalize_eol(text, eol):
    """Return TEXT with every line ending (CRLF, lone CR, or LF)
    replaced by the string EOL."""

    # Collapse CRLF before lone CR so that a CRLF pair is not counted as
    # two separate line endings.
    lf_only = text.replace('\r\n', '\n').replace('\r', '\n')
    if eol == '\n':
        return lf_only
    return eol.join(lf_only.split('\n'))
class DumpfileDelegate(SVNRepositoryDelegate):
    """Create a Subversion dumpfile.

    Each repository operation is appended as a record to the file that
    is opened in __init__(); finish() closes that file."""

    def __init__(self, revision_reader, dumpfile_path):
        """Return a new DumpfileDelegate instance, attached to a dumpfile
        DUMPFILE_PATH, using Ctx().cvs_filename_decoder().

        REVISION_READER is the object whose get_content_stream() method
        is used to fetch the contents of file revisions."""

        self._revision_reader = revision_reader
        self.dumpfile_path = dumpfile_path

        self.dumpfile = open(self.dumpfile_path, 'wb')
        self._write_dumpfile_header(self.dumpfile)

        # A set of the basic project infrastructure project directories
        # that have been created so far, as SVN paths.  (The root
        # directory is considered to be present at initialization.)  This
        # includes all of the LOD paths, and all of their parent
        # directories etc.
        self._basic_directories = set([''])

    def _write_dumpfile_header(self, dumpfile):
        """Write the standard dumpfile header to DUMPFILE."""

        # Since the CVS repository doesn't have a UUID, and the Subversion
        # repository will be created with one anyway, we don't specify a
        # UUID in the dumpfile.
        dumpfile.write('SVN-fs-dump-format-version: 2\n\n')

    def _utf8_path(self, path):
        """Return a copy of PATH encoded in UTF-8.

        Raise FatalError if any component of PATH cannot be converted."""

        decoder = Ctx().cvs_filename_decoder

        # Convert each path component separately (as they may each use
        # different encodings).
        try:
            return '/'.join([
                decoder(piece).encode('utf8')
                for piece in path.split('/')
                ])
        except UnicodeError:
            raise FatalError(
                "Unable to convert a path '%s' to internal encoding.\n"
                "Consider rerunning with one or more '--encoding' parameters or\n"
                "with '--fallback-encoding'."
                % (path,))

    def _string_for_prop(self, name, value):
        """Return a property in the form needed for the dumpfile."""

        return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)

    def start_commit(self, revnum, revprops):
        """Emit the start of SVN_COMMIT (an SVNCommit).

        REVNUM is the revision number and REVPROPS maps revision property
        names to their values; properties whose value is None are
        omitted from the output."""

        self.revision = revnum

        # The start of a new commit typically looks like this:
        #
        #   Revision-number: 1
        #   Prop-content-length: 129
        #   Content-length: 129
        #
        #   K 7
        #   svn:log
        #   V 27
        #   Log message for revision 1.
        #   K 10
        #   svn:author
        #   V 7
        #   jrandom
        #   K 8
        #   svn:date
        #   V 27
        #   2003-04-22T22:57:58.132837Z
        #   PROPS-END
        #
        # Notice that the length headers count everything -- not just the
        # length of the data but also the lengths of the lengths, including
        # the 'K ' or 'V ' prefixes.
        #
        # The reason there are both Prop-content-length and Content-length
        # is that the former includes just props, while the latter includes
        # everything.  That's the generic header form for any entity in a
        # dumpfile.  But since revisions only have props, the two lengths
        # are always the same for revisions.

        # Calculate the output needed for the property definitions
        # (sorted by property name).
        prop_strings = [
            self._string_for_prop(propname, revprops[propname])
            for propname in sorted(revprops)
            if revprops[propname] is not None
            ]

        all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
        total_len = len(all_prop_strings)

        # Print the revision header and revprops
        self.dumpfile.write(
            'Revision-number: %d\n'
            'Prop-content-length: %d\n'
            'Content-length: %d\n'
            '\n'
            '%s'
            '\n'
            % (self.revision, total_len, total_len, all_prop_strings)
            )

    def end_commit(self):
        """Do nothing; a revision record needs no terminator."""

        pass

    def _make_any_dir(self, path):
        """Emit the creation of directory PATH."""

        self.dumpfile.write(
            "Node-path: %s\n"
            "Node-kind: dir\n"
            "Node-action: add\n"
            "\n"
            "\n"
            % (self._utf8_path(path),)
            )

    def _register_basic_directory(self, path, create):
        """Register the creation of PATH if it is not already there.

        Create any parent directories that do not already exist.  If
        CREATE is set, also create PATH if it doesn't already exist.  This
        method should only be used for the LOD paths and the directories
        containing them, not for directories within an LOD path."""

        if path in self._basic_directories:
            return

        # Make sure that the parent directory is present:
        self._register_basic_directory(path_split(path)[0], True)
        if create:
            self._make_any_dir(path)
        # PATH is recorded even when CREATE is false; in that case the
        # caller is expected to emit the node itself (see copy_lod()).
        self._basic_directories.add(path)

    def initialize_project(self, project):
        """Create any initial directories for the project.

        The trunk, tags, and branches directories are created the first
        time the project is seen.  Be sure not to create parent
        directories that already exist (e.g., because two directories
        share part of their paths either within or across projects)."""

        for path in project.get_initial_directories():
            self._register_basic_directory(path, True)

    def initialize_lod(self, lod):
        """Create the directory for LOD, unless its path is empty."""

        lod_path = lod.get_path()
        if lod_path:
            self._register_basic_directory(lod_path, True)

    def mkdir(self, lod, cvs_directory):
        """Emit the creation of CVS_DIRECTORY within LOD."""

        self._make_any_dir(lod.get_path(cvs_directory.cvs_path))

    def _change_dir_props(self, dir_path, prop_contents):
        """Emit a property change for the directory DIR_PATH.

        PROP_CONTENTS is the complete property block, including the
        terminating 'PROPS-END' line."""

        prop_len = len(prop_contents)

        # write headers, then props
        self.dumpfile.write(
            'Node-path: %s\n'
            'Node-kind: dir\n'
            'Node-action: change\n'
            'Prop-content-length: %d\n'
            'Content-length: %d\n'
            '\n'
            '%s'
            % (self._utf8_path(dir_path),
               prop_len, prop_len, prop_contents)
            )

    def _add_or_change_path(self, cvs_rev, op):
        """Emit the addition or change corresponding to CVS_REV.

        OP is either the constant OP_ADD or OP_CHANGE.

        If the file is named '.cvsignore', its contents are also emitted
        as the svn:ignore property of the containing directory, and the
        file itself is only emitted if Ctx().keep_cvsignore is set."""

        assert op in [OP_ADD, OP_CHANGE]

        # The property handling here takes advantage of an undocumented
        # but IMHO consistent feature of the Subversion dumpfile-loading
        # code.  When a node's properties aren't mentioned (that is, the
        # "Prop-content-length:" header is absent, no properties are
        # listed at all, and there is no "PROPS-END\n" line) then no
        # change is made to the node's properties.
        #
        # This is consistent with the way dumpfiles behave w.r.t. text
        # content changes, so I'm comfortable relying on it.  If you
        # commit a change to *just* the properties of some node that
        # already has text contents from a previous revision, then in the
        # dumpfile output for the prop change, no "Text-content-length:"
        # nor "Text-content-md5:" header will be present, and the text of
        # the file will not be given.  But this does not cause the file's
        # text to be erased!  It simply remains unchanged.
        #
        # This works out great for cvs2svn, due to lucky coincidences:
        #
        # For files, we set most properties in the first revision and
        # never change them.  (The only exception is the 'cvs2svn:cvs-rev'
        # property.)  If 'cvs2svn:cvs-rev' is not being used, then there
        # is no need to remember the full set of properties on a given
        # file once we've set it.
        #
        # For directories, the only property we set is "svn:ignore", and
        # while we may change it after the first revision, we always do so
        # based on the contents of a ".cvsignore" file -- in other words,
        # CVS is doing the remembering for us, so we still don't have to
        # preserve the previous value of the property ourselves.

        # Calculate the (sorted-by-name) property string and length, if any.
        svn_props = cvs_rev.get_properties()
        if cvs_rev.properties_changed:
            prop_contents = ''.join([
                self._string_for_prop(pname, svn_props[pname])
                for pname in sorted(svn_props)
                ]) + 'PROPS-END\n'
            props_header = 'Prop-content-length: %d\n' % len(prop_contents)
        else:
            prop_contents = ''
            props_header = ''

        # If the file has keywords, we must prevent CVS/RCS from expanding
        # the keywords because they must be unexpanded in the repository,
        # or Subversion will get confused.
        has_keywords = bool(svn_props.get('svn:keywords', None))
        stream = self._revision_reader.get_content_stream(
            cvs_rev, suppress_keyword_substitution=has_keywords
            )

        # The whole revision is read into memory (EOL conversion needs the
        # complete text).  Close the stream even if reading fails.
        try:
            if Ctx().decode_apple_single:
                # Insert a filter to decode any files that are in AppleSingle
                # format:
                stream = get_maybe_apple_single_stream(stream)
            data = stream.read()
        finally:
            stream.close()

        # Convert all EOLs to the style required by svn:eol-style, if set.
        eol_style = svn_props.get('svn:eol-style', None)
        if eol_style:
            eol = EOL_STYLE_REPLACEMENTS[eol_style]
            data = canonicalize_eol(data, eol)

        # treat .cvsignore as a directory property
        svn_path = cvs_rev.get_svn_path()
        dir_path, basename = path_split(svn_path)
        if basename == '.cvsignore':
            ignore_value = '\n'.join(generate_ignores(data))
            if ignore_value:
                ignore_value += '\n'
            self._change_dir_props(
                dir_path,
                self._string_for_prop('svn:ignore', ignore_value)
                + 'PROPS-END\n'
                )
            if not Ctx().keep_cvsignore:
                return

        # DATA is complete at this point, so the length and checksum can
        # be computed up front and the record written in a single pass.
        # The content length is the length of property data, text data,
        # and any metadata around/inside around them.  The fixed field
        # widths are retained so that the emitted bytes do not change.
        length = len(data)
        checksum = md5(data).hexdigest()
        self.dumpfile.write(
            'Node-path: %s\n'
            'Node-kind: file\n'
            'Node-action: %s\n'
            '%s'  # no property header if no props
            'Text-content-length: %16d\n'
            'Text-content-md5: %32s\n'
            'Content-length: %16d\n'
            '\n'
            % (self._utf8_path(svn_path), op, props_header,
               length, checksum, length + len(prop_contents))
            )

        if prop_contents:
            self.dumpfile.write(prop_contents)

        self.dumpfile.write(data)

        # This record is done (write two newlines -- one to terminate
        # contents that weren't themselves newline-terminated, one to
        # provide a blank line for readability).
        self.dumpfile.write('\n\n')

    def add_path(self, cvs_rev):
        """Emit the addition corresponding to CVS_REV, a CVSRevisionAdd."""

        self._add_or_change_path(cvs_rev, OP_ADD)

    def change_path(self, cvs_rev):
        """Emit the change corresponding to CVS_REV, a CVSRevisionChange."""

        self._add_or_change_path(cvs_rev, OP_CHANGE)

    def delete_lod(self, lod):
        """Emit the deletion of LOD."""

        lod_path = lod.get_path()
        self.dumpfile.write(
            'Node-path: %s\n'
            'Node-action: delete\n'
            '\n'
            % (self._utf8_path(lod_path),)
            )
        self._basic_directories.remove(lod_path)

    def delete_path(self, lod, cvs_path):
        """Emit the deletion of CVS_PATH from LOD.

        If the path is a '.cvsignore' file, also clear the properties of
        the containing directory; the file deletion itself is then only
        emitted if Ctx().keep_cvsignore is set."""

        svn_path = lod.get_path(cvs_path.cvs_path)
        dir_path, basename = path_split(svn_path)
        if basename == '.cvsignore':
            # When a .cvsignore file is deleted, the directory's svn:ignore
            # property needs to be deleted.
            self._change_dir_props(dir_path, 'PROPS-END\n')
            if not Ctx().keep_cvsignore:
                return

        self.dumpfile.write(
            'Node-path: %s\n'
            'Node-action: delete\n'
            '\n'
            % (self._utf8_path(svn_path),)
            )

    def copy_lod(self, src_lod, dest_lod, src_revnum):
        """Emit the copy of SRC_LOD, as of revision SRC_REVNUM, to
        DEST_LOD."""

        # Register the main LOD directory, and create parent directories
        # as needed:
        self._register_basic_directory(dest_lod.get_path(), False)

        self.dumpfile.write(
            'Node-path: %s\n'
            'Node-kind: dir\n'
            'Node-action: add\n'
            'Node-copyfrom-rev: %d\n'
            'Node-copyfrom-path: %s\n'
            '\n'
            % (self._utf8_path(dest_lod.get_path()),
               src_revnum, self._utf8_path(src_lod.get_path()))
            )

    def copy_path(self, cvs_path, src_lod, dest_lod, src_revnum):
        """Emit the copy of CVS_PATH from SRC_LOD, as of revision
        SRC_REVNUM, to DEST_LOD.

        CVS_PATH must be a CVSFile or a CVSDirectory; otherwise
        InternalError is raised.  A '.cvsignore' file is only copied if
        Ctx().keep_cvsignore is set."""

        if isinstance(cvs_path, CVSFile):
            node_kind = 'file'
            if cvs_path.basename == '.cvsignore':
                # FIXME: Here we have to adjust the containing directory's
                # svn:ignore property to reflect the addition of the
                # .cvsignore file to the LOD!  This is awkward because we
                # don't have the contents of the .cvsignore file available.
                if not Ctx().keep_cvsignore:
                    return
        elif isinstance(cvs_path, CVSDirectory):
            node_kind = 'dir'
        else:
            raise InternalError()

        self.dumpfile.write(
            'Node-path: %s\n'
            'Node-kind: %s\n'
            'Node-action: add\n'
            'Node-copyfrom-rev: %d\n'
            'Node-copyfrom-path: %s\n'
            '\n'
            % (
                self._utf8_path(dest_lod.get_path(cvs_path.cvs_path)),
                node_kind,
                src_revnum,
                self._utf8_path(src_lod.get_path(cvs_path.cvs_path))
                )
            )

    def finish(self):
        """Perform any cleanup necessary after all revisions have been
        committed."""

        self.dumpfile.close()
def generate_ignores(raw_ignore_val):
    """Return the list of ignore patterns in RAW_IGNORE_VAL.

    RAW_IGNORE_VAL is split on whitespace.  A lone '!' resets the list,
    so only the patterns that follow the last '!' are returned.
    See http://cvsbook.red-bean.com/cvsbook.html#cvsignore"""

    patterns = raw_ignore_val.split()
    try:
        # Locate the last '!' by searching the reversed list.
        last_reset = len(patterns) - 1 - patterns[::-1].index('!')
    except ValueError:
        # No reset marker at all: every pattern counts.
        return patterns
    return patterns[last_reset + 1:]