InitializeChangesetsPass: We are looking for the *biggest* gap.
[cvs2svn.git] / contrib / destroy_repository.py
blob0f42772b0f05abf428a10b23d7077fc5ebd986d1
1 #! /usr/bin/python
3 # (Be in -*- python -*- mode.)
5 # ====================================================================
6 # Copyright (c) 2006-2008 CollabNet. All rights reserved.
8 # This software is licensed as described in the file COPYING, which
9 # you should have received as part of this distribution. The terms
10 # are also available at http://subversion.tigris.org/license-1.html.
11 # If newer versions of this license are posted there, you may use a
12 # newer version instead, at your option.
14 # This software consists of voluntary contributions made by many
15 # individuals. For exact contribution history, see the revision
16 # history and logs, available at http://cvs2svn.tigris.org/.
17 # ====================================================================
19 """Usage: destroy_repository.py OPTION... PATH...
21 Strip the text content out of RCS-format files.
23 *** This script irretrievably destroys any RCS files that it is applied to!
25 This script attempts to strip the file text, log messages, and author
26 names out of RCS files, in addition to renaming RCS files and directories.
27 (This is useful to make test cases smaller and to remove much of the
28 proprietary information that is stored in a repository.) Note that this
29 script does NOT obliterate other information that might also be considered
30 proprietary, such as 'CVSROOT' directories and their contents, commit dates,
31 etc. In fact, it's not guaranteed even to obliterate all of the file text,
32 or to do anything else for that matter.
34 The following OPTIONs are recognized:
35 --all destroy all data (this is the default if no options are given)
36 --data destroy revision data (file contents) only
37 --metadata destroy revision metadata (author, log message, description) only
38 --symbols destroy symbol names (branch/tag names) only
39 --filenames destroy the filenames of RCS files
40 --basenames destroy basenames only (keep filename extensions, such as '.txt')
41 (--filenames overrides --basenames)
42 --dirnames destroy directory names within given PATH. PATH itself (if a
43 directory) is not destroyed.
44 --cvsroot delete files within 'CVSROOT' directories, instead of leaving
45 them untouched. The 'CVSROOT' directory itself is preserved.
46 --no-<X> where <X> is one of the above options negates the meaning of that
47 option.
49 Each PATH that is a *,v file will be stripped.
51 Each PATH that is a directory will be traversed and all of its *,v
52 files stripped.
54 Other PATHs will be ignored.
57 Examples of usage:
58 destroy_repository.py PATH
59 destroys all data in PATH
61 destroy_repository.py --all PATH
62 same as above
64 destroy_repository.py --data PATH
65 destroys only revision data
67 destroy_repository.py --no-data PATH
68 destroys everything but revision data
70 destroy_repository.py --data --metadata PATH
71 destroys revision data and metadata only
73 ---->8----
75 The *,v files must be writable by the user running the script.
76 Typically CVS repositories are read-only, so you might have to run
77 something like
79 $ chmod -R ug+w my/repo/path
81 before running this script.
83 Most cvs2svn behavior is completely independent of the text contained
84 in an RCS file. (The text is not even looked at until OutputPass.)
86 The idea is to use this script when preparing test cases for problems
87 that you experience with cvs2svn. Instead of sending us your whole
88 CVS repository, you should:
90 1. Make a copy of the original repository
92 2. Run this script on the copy (NEVER ON THE ORIGINAL!!!)
94 3. Verify that the problem still exists when you use cvs2svn to
95 convert the 'destroyed' copy
97 4. Send us the 'destroyed' copy along with the exact cvs2svn version
98 that you used, the exact command line that you used to start the
99 conversion, and the options file if you used one.
101 Please also consider using shrink_test_case.py to localize the problem
102 even further.
106 import sys
107 import os
108 import shutil
109 import re
111 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
113 from cvs2svn_lib.key_generator import KeyGenerator
114 from cvs2svn_lib.rcsparser import parse
115 from rcs_file_filter import WriteRCSFileSink
116 from rcs_file_filter import FilterSink
# Which components are to be destroyed.  Every component defaults to True;
# individual entries may be overridden by whoever configures the run.
destroy = dict.fromkeys(
    (
        'data',
        'metadata',
        'symbols',
        'filenames',
        'basenames',
        'dirnames',
        'cvsroot',
    ),
    True
)
# Directory (relative to the current working directory) in which temporary
# output files are created.
tmpdir = 'destroy_repository-tmp'

# Supplies the numeric ids embedded in temporary filenames.
file_key_generator = KeyGenerator(1)


def get_tmp_filename():
    """Return a path inside tmpdir named after the next generated id."""
    key = file_key_generator.gen_id()
    basename = 'f%07d.tmp' % key
    return os.path.join(tmpdir, basename)
# Mapping from "real" symbol name to rewritten symbol name
symbol_map = {}


def rewrite_symbol(name):
    """Return the anonymized replacement for symbol NAME.

    The first time a name is seen it is assigned 'symbolNNNNN', where NNNNN
    is the number of symbols already recorded; later calls with the same
    name return the same replacement."""
    try:
        return symbol_map[name]
    except KeyError:
        replacement = "symbol%05d" % (len(symbol_map),)
        symbol_map[name] = replacement
        return replacement
# Mapping from "real" filename to rewritten filename
filename_map = {
    # The empty filename always maps to itself.  This is useful when
    # preserving the last component of filenames with only one component.
    '': '',
}

# Set the following to true if we should not destroy the last filename
# component (aka. filename extension)
keep_last_filename_component = False


def rewrite_filename(pathname):
    """Return PATHNAME with its final component replaced by a generated name.

    Nothing is rewritten unless destroy['filenames'] is set.  The directory
    part is kept as is; a trailing ',v' is kept; and when
    keep_last_filename_component is true the extension is kept too (a name
    without an extension is then returned untouched).  A name seen before
    reuses its earlier replacement; a new name gets the first 'fileNNN'
    that does not already exist on disk next to it."""
    if not destroy['filenames']:
        return pathname

    dirname, basename = os.path.split(pathname)
    suffix = ''

    # Strip trailing ',v' now, and re-append it to the rewritten filename
    if basename.endswith(',v'):
        suffix = ',v'
        basename = basename[:-2]

    if keep_last_filename_component:
        basename, extension = os.path.splitext(basename)
        if not extension:
            # No extension to preserve: leave this filename entirely alone.
            return pathname
        suffix = extension + suffix

    # Known name: reuse the replacement chosen earlier.
    if basename in filename_map:
        return os.path.join(dirname, filename_map[basename] + suffix)

    # Unknown name: generate replacements until one is free on disk.
    num = len(filename_map)
    while True:
        filename_map[basename] = "file%03d" % (num,)
        candidate = os.path.join(dirname, filename_map[basename] + suffix)
        if not os.path.exists(candidate):
            return candidate
        num += 1
# List of directory names to be renamed. This list is filled while we walk
# the directory structure, and then processed afterwards, in order to not
# mess up the directory structure while it is being walked.
rename_dir_list = []


def rename_dirs():
    """Rename all directories occurring in rename_dir_list.

    Directories named 'Attic' are left alone.  Every other directory is
    moved to 'dirNNN' within the same parent.  Directories that share a
    name share a replacement, unless that replacement is already taken in
    the directory's parent, in which case a fresh 'dirNNN' is used for that
    one directory.

    rename_dir_list is reversed in place as a side effect."""
    # Make sure we rename subdirs _before_ renaming their parents
    rename_dir_list.reverse()
    rename_map = {}
    num = 0
    for d in rename_dir_list:
        (parent, name) = os.path.split(d)
        # Skip rewriting 'Attic' directories
        if name == "Attic":
            continue
        new_name = rename_map.get(name)
        if new_name is None or os.path.exists(os.path.join(parent, new_name)):
            # Either this name has no replacement yet, or the replacement is
            # already occupied in this parent.  Moving onto an existing
            # directory would nest D inside it, so look for a free name.
            while True:
                num += 1
                candidate = "dir%03d" % (num,)
                if not os.path.exists(os.path.join(parent, candidate)):
                    break
            if new_name is None:
                rename_map[name] = candidate
            new_name = candidate
        shutil.move(d, os.path.join(parent, new_name))
class Substituter:
    """Replace strings with generated stand-ins, consistently.

    Each distinct input string is assigned one replacement, produced by
    formatting the template with the next id from the key generator.
    Asking again for the same input returns the same replacement."""

    def __init__(self, template):
        # Format string applied to each generated id (e.g. 'author%d').
        self.template = template
        self.key_generator = KeyGenerator(1)

        # A map from old values to new ones.
        self.substitutions = {}

    def get_substitution(self, s):
        """Return the replacement for S, creating and recording it if new."""
        r = self.substitutions.get(s)
        if r is None:
            r = self.template % self.key_generator.gen_id()
            self.substitutions[s] = r
        return r
class LogSubstituter(Substituter):
    """Substituter for log messages that leaves certain stock messages alone.

    Messages matching one of untouchable_log_res keep their wording; only
    the symbol and filename they mention are rewritten (when the
    corresponding destroy options are set).  Any other message is replaced
    by 'log N' when destroy['metadata'] is set, and returned unchanged
    otherwise."""

    # If a log message matches any of these regular expressions, it
    # is passed through untouched.
    untouchable_log_res = [
        re.compile(r'^Initial revision\n$'),
        re.compile(r'^file (?P<filename>.+) was initially added'
                   r' on branch (?P<symbol>.+)\.\n$'),
        re.compile(r'^\*\*\* empty log message \*\*\*\n$'),
        re.compile(r'^initial checkin$'),
    ]

    def __init__(self):
        Substituter.__init__(self, 'log %d')

    def get_substitution(self, log):
        """Return the text that should replace log message LOG."""
        for untouchable_log_re in self.untouchable_log_res:
            m = untouchable_log_re.search(log)
            if not m:
                continue

            # We have matched one of the above regexps: keep the message,
            # rewriting only the text captured by its named subgroups.
            groups = m.groupdict()
            edits = []
            if 'symbol' in groups and destroy['symbols']:
                edits.append(
                    (m.span('symbol'), rewrite_symbol(groups['symbol']))
                )
            if 'filename' in groups and destroy['filenames']:
                edits.append(
                    (m.span('filename'), rewrite_filename(groups['filename']))
                )

            # Splice by position instead of str.replace(): the captured
            # text may also occur elsewhere in the message (a symbol named
            # 'b' occurs inside the word 'branch'), and those occurrences
            # must not be touched.  Work right-to-left so that earlier
            # offsets stay valid.
            keep_log = log
            for ((start, end), replacement) in sorted(edits, reverse=True):
                keep_log = keep_log[:start] + replacement + keep_log[end:]
            return keep_log

        if destroy['metadata']:
            return Substituter.get_substitution(self, log)
        return log
class DestroyerFilterSink(FilterSink):
    """Filter sink that anonymizes RCS data before passing it downstream.

    Which pieces are rewritten is controlled by the module-level 'destroy'
    settings; everything else is forwarded to the wrapped sink as is."""

    def __init__(self, author_substituter, log_substituter, sink):
        FilterSink.__init__(self, sink)

        self.author_substituter = author_substituter
        self.log_substituter = log_substituter

    def set_head_revision(self, revision):
        # Remembered so that set_revision_info() can recognize HEAD.
        self.head_revision = revision
        FilterSink.set_head_revision(self, revision)

    def define_tag(self, name, revision):
        new_name = name
        if destroy['symbols']:
            new_name = rewrite_symbol(name)
        FilterSink.define_tag(self, new_name, revision)

    def define_revision(
        self, revision, timestamp, author, state, branches, next
    ):
        new_author = author
        if destroy['metadata']:
            new_author = self.author_substituter.get_substitution(author)
        FilterSink.define_revision(
            self, revision, timestamp, new_author, state, branches, next
        )

    def set_description(self, description):
        new_description = description
        if destroy['metadata']:
            new_description = ''
        FilterSink.set_description(self, new_description)

    def _destroyed_text(self, revision, text):
        """Return the placeholder that replaces TEXT for REVISION."""
        if revision == self.head_revision:
            # Set the HEAD text unconditionally.  (It could be that
            # revision HEAD-1 has an empty deltatext, in which case the
            # HEAD text was actually committed in an earlier commit.)
            return (
                'This text was last seen in HEAD (revision %s)\n'
            ) % (revision,)

        if text == '':
            # This is a no-op revision; preserve that fact.  (It might be
            # relied on by cvs2svn).
            return text

        if revision.count('.') == 1:
            # On trunk, it could be that revision N-1 has an empty
            # deltatext, in which case text for revision N was actually
            # committed in an earlier commit.
            return (
                'd1 1\n'
                'a1 1\n'
                'This text was last seen in revision %s\n'
            ) % (revision,)

        # On a branch, we know that the text was changed in revision N
        # (even though the same text might also be kept across later
        # revisions N+1 etc.)
        return (
            'd1 1\n'
            'a1 1\n'
            'This text was committed in revision %s\n'
        ) % (revision,)

    def set_revision_info(self, revision, log, text):
        if destroy['data']:
            text = self._destroyed_text(revision, text)

        # The log substituter decides for itself what to rewrite, so it is
        # consulted whenever any of the options it looks at is set.
        wants_log_rewrite = (
            destroy['metadata'] or destroy['symbols'] or destroy['filenames']
        )
        if wants_log_rewrite:
            log = self.log_substituter.get_substitution(log)

        FilterSink.set_revision_info(self, revision, log, text)
class FileDestroyer:
    """Destroy individual RCS files or whole directory trees of them.

    One author substituter and one log substituter are shared by all files
    processed through the same instance."""

    def __init__(self):
        self.log_substituter = LogSubstituter()
        self.author_substituter = Substituter('author%d')

    def destroy_file(self, filename):
        """Rewrite the RCS file FILENAME, possibly under a new name.

        The filtered output is first written to a temporary file; the
        original is then removed and the temporary file moved to the
        (possibly rewritten) filename."""
        tmp_filename = get_tmp_filename()
        f = open(tmp_filename, 'wb')
        try:
            new_filename = rewrite_filename(filename)
            oldf = open(filename, 'rb')
            try:
                parse(
                    oldf,
                    DestroyerFilterSink(
                        self.author_substituter,
                        self.log_substituter,
                        WriteRCSFileSink(f),
                    )
                )
            finally:
                # Close the input even if parsing fails.
                oldf.close()
        finally:
            # Close the output even if rewriting or parsing fails.
            f.close()

        # Replace the original file with the new one:
        assert filename == new_filename or not os.path.exists(new_filename)
        os.remove(filename)
        shutil.move(tmp_filename, new_filename)

    def visit(self, dirname, names):
        """os.path.walk() callback: process the entries NAMES of DIRNAME.

        *,v files are destroyed, subdirectories are queued for renaming
        when destroy['dirnames'] is set, and other entries are reported
        and ignored."""
        # Special handling of CVSROOT directories
        if "CVSROOT" in names:
            path = os.path.join(dirname, "CVSROOT")
            if destroy['cvsroot']:
                # Remove all contents within CVSROOT
                sys.stderr.write('Deleting %s contents...' % path)
                shutil.rmtree(path)
                os.mkdir(path)
            else:
                # Leave CVSROOT alone
                sys.stderr.write('Skipping %s...' % path)
                # NOTE(review): assumes this 'del' belongs to the 'else'
                # branch only; if so, an emptied CVSROOT stays in NAMES
                # and is queued for renaming below -- confirm intent.
                del names[names.index("CVSROOT")]
            sys.stderr.write('done.\n')
        for name in names:
            path = os.path.join(dirname, name)
            if os.path.isfile(path) and path.endswith(',v'):
                sys.stderr.write('Destroying %s...' % path)
                self.destroy_file(path)
                sys.stderr.write('done.\n')
            elif os.path.isdir(path):
                if destroy['dirnames']:
                    rename_dir_list.append(path)
                # Subdirectories are traversed automatically
            else:
                sys.stderr.write('File %s is being ignored.\n' % path)

    def destroy_dir(self, path):
        """Walk the directory tree at PATH, calling visit() on each directory."""
        os.path.walk(path, FileDestroyer.visit, self)
def usage_abort(msg):
    """Print MSG (if any) to stderr and the usage text to stdout, then exit.

    The usage text is the part of this module's docstring that precedes
    the '---->8----' marker.  The process exits with status 1."""
    if msg:
        # The error line, followed by one blank line.
        sys.stderr.write("ERROR: %s\n\n" % (msg,))
    # Use this file's docstring as a usage string, but only the first part
    usage = __doc__.split('\n---->8----', 1)[0]
    sys.stdout.write(usage + '\n')
    sys.exit(1)
if __name__ == '__main__':
    # Temporary output files are written below tmpdir; make sure it exists.
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)

    # Paths to be destroyed
    paths = []

    # Command-line argument processing
    seen_option = False
    for arg in sys.argv[1:]:
        if not arg.startswith("--"):
            # Path argument
            paths.append(arg)
            continue

        # Option processing
        option = arg[2:].lower()
        value = not option.startswith("no-")
        if not value:
            # Strip the 'no-' prefix to get at the option being negated.
            option = option[3:]

        if not seen_option:
            # Use the first option on the command-line to determine the
            # default actions. If the first option is negated (i.e. --no-X)
            # the default action should be to destroy everything.
            # Otherwise, the default action should be to destroy nothing.
            # This makes both positive and negative options work
            # intuitively (e.g. "--data" will destroy only data, while
            # "--no-data" will destroy everything BUT data).
            for key in list(destroy):
                destroy[key] = not value
            seen_option = True

        if option in destroy:
            destroy[option] = value
        elif option == "all":
            for key in list(destroy):
                destroy[key] = value
        else:
            usage_abort("Unknown OPTION '%s'" % arg)

    # If --basenames is given (and not also --filenames), we shall destroy
    # filenames, up to, but not including the last component.
    if destroy['basenames'] and not destroy['filenames']:
        destroy['filenames'] = True
        keep_last_filename_component = True

    if not paths:
        usage_abort("No PATH given")

    # Destroy given PATHs
    file_destroyer = FileDestroyer()
    for path in paths:
        if os.path.isfile(path) and path.endswith(',v'):
            file_destroyer.destroy_file(path)
        elif os.path.isdir(path):
            file_destroyer.destroy_dir(path)
        else:
            sys.stderr.write('PATH %s is being ignored.\n' % path)

    # Directories are renamed only after everything has been walked.
    if destroy['dirnames']:
        rename_dirs()