# generate_blobs.py: Be careful if deleting revision 1.1.
# [cvs2svn.git] / cvs2svn_lib / generate_blobs.py
# blob 38e890559de5debf64ebaddc6bfec866994f8cd0
#!/usr/bin/env python -u
# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2009-2010 CollabNet.  All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.  The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals.  For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================
"""Generate git blobs directly from RCS files.

Usage: generate_blobs.py BLOBFILE

To standard input should be written a series of pickles, each of which
contains the following tuple:

    (RCSFILE, {CVS_REV : MARK, ...})

indicating which RCS file to read, which CVS revisions should be
written to the blob file, and which marks to give each of the blobs.

Since the tuples are read from stdin, either the calling program has
to write to this program's stdin in binary mode and ensure that this
program's standard input is opened in binary mode (e.g., using
Python's '-u' option) or both can be in text mode *provided* that
pickle protocol 0 is used.

The program does most of its work in RAM, keeping at most one revision
fulltext and one revision deltatext (plus perhaps one or two copies as
scratch space) in memory at a time.  But there are times when the
fulltext of a revision is needed multiple times, for example when
multiple branches sprout from the revision.  In these cases, the
fulltext is written to disk.  If the fulltext is also needed for the
blobfile, then the copy in the blobfile is read again when it is
needed.  If the fulltext is not needed in the blobfile, then it is
written to a temporary file created with Python's tempfile module."""
46 import sys
47 import os
48 import tempfile
49 import cPickle as pickle
51 sys.path.insert(0, os.path.dirname(os.path.dirname(sys.argv[0])))
53 from cvs2svn_lib.rcsparser import Sink
54 from cvs2svn_lib.rcsparser import parse
55 from cvs2svn_lib.rcs_stream import RCSStream
def read_marks():
    """Read a revision-to-mark table from standard input.

    Each non-blank line of sys.stdin must consist of exactly two
    whitespace-separated fields: a CVS revision number (e.g., 1.2.3.4)
    followed by the mark to assign to it.  Blank lines are skipped.

    Return a dict {rev : mark}; both keys and values are the strings
    exactly as read.  If a revision appears more than once, the last
    mark wins.

    Raise ValueError if a non-blank line does not contain exactly two
    fields."""

    # A map from CVS revision number (e.g., 1.2.3.4) to mark:
    marks = {}
    for line in sys.stdin:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g., a trailing newline) instead of
            # failing on the two-element unpacking below.
            continue
        [rev, mark] = fields
        marks[rev] = mark

    return marks
class RevRecord(object):
    """Bookkeeping for one CVS revision while its fulltext is needed.

    Attributes:

      rev -- the CVS revision number of this revision.

      mark -- the mark under which the fulltext must still be written
          as a blob, or None if no blob is (or is any longer) needed.

      base -- the revision whose fulltext is the base for this one's
          delta, or None if no base has been recorded.

      refs -- the set of revisions that use this one as their base.

      fulltext -- a tuple (f, offset, length) telling where the
          fulltext has been stored, or None if it has not been stored."""

    def __init__(self, rev, mark=None):
        self.rev = rev
        self.mark = mark

        # The rev whose fulltext is the base for this one's delta.
        self.base = None

        # Other revs that refer to this one as their base text:
        self.refs = set()

        # The (f, offset, length) where the fulltext of this revision can
        # be found:
        self.fulltext = None

    def is_needed(self):
        """Return True iff this fulltext is still needed.

        That is the case while a blob remains to be written for it
        (mark is not None) or while other revisions still list it as
        their base (refs is non-empty)."""

        return bool(self.mark is not None or self.refs)

    def is_written(self):
        """Return True iff the fulltext has been stored in some file."""

        return self.fulltext is not None

    def write_blob(self, f, text):
        """Append TEXT to file F as a blob labelled with self.mark.

        The record written is 'blob', 'mark :MARK' and 'data LENGTH'
        header lines followed by TEXT and a newline.  The location of
        TEXT itself (not of the header) is remembered in self.fulltext
        so that read_fulltext() can retrieve it later."""

        # Always append, wherever an earlier read_fulltext() left the
        # file position:
        f.seek(0, 2)
        length = len(text)
        f.write('blob\n')
        f.write('mark :%s\n' % (self.mark,))
        f.write('data %d\n' % (length,))
        offset = f.tell()
        f.write(text)
        f.write('\n')

        self.fulltext = (f, offset, length)

        # This record (with its mark) has now been written, so the mark is
        # no longer needed.  Setting it to None might allow is_needed() to
        # become False:
        self.mark = None

    def write(self, f, text):
        """Append TEXT, without any header, to file F.

        The location of TEXT is remembered in self.fulltext so that
        read_fulltext() can retrieve it later."""

        f.seek(0, 2)
        offset = f.tell()
        length = len(text)
        f.write(text)
        self.fulltext = (f, offset, length)

    def read_fulltext(self):
        """Return the fulltext stored by write_blob() or write().

        The fulltext must already have been written."""

        assert self.fulltext is not None
        (f, offset, length) = self.fulltext
        f.seek(offset)
        return f.read(length)

    def __str__(self):
        """Return a debugging summary: rev, mark (if any), refs, and
        whether the fulltext has been written."""

        if self.mark is not None:
            return '%s (%r): %r, %s' % (
                self.rev, self.mark, self.refs, self.fulltext is not None,
                )
        else:
            return '%s: %r, %s' % (
                self.rev, self.refs, self.fulltext is not None,
                )
class WriteBlobSink(Sink):
    """An RCS parser sink that writes the requested revisions as blobs.

    The sink is constructed with the open blob file and a map
    {rev : mark} of the revisions for which blobs are wanted.  It keeps
    a RevRecord for every revision whose fulltext is still needed and
    reconstructs fulltexts by applying each revision's delta to the
    fulltext of its base revision.

    NOTE(review): the order in which the parser invokes the callbacks
    below is defined by cvs2svn_lib.rcsparser, which is not visible
    here; the code assumes that all define_revision() calls and
    tree_completed() precede the set_revision_info() calls, and that a
    revision's base is processed before the revision itself."""

    def __init__(self, blobfile, marks):
        self.blobfile = blobfile

        # A map {rev : RevRecord} for all of the revisions whose fulltext
        # will still be needed:
        self.revrecs = {}

        # The revisions that need marks will definitely be needed, so
        # create records for them now (the rest will be filled in while
        # reading the RCS file):
        for (rev, mark) in marks.items():
            self.revrecs[rev] = RevRecord(rev, mark)

        # The RevRecord of the last fulltext that has been reconstructed,
        # if it still is_needed():
        self.last_revrec = None
        # An RCSStream holding the fulltext of last_revrec:
        self.last_rcsstream = None

        # A file to temporarily hold the fulltexts of revisions for which
        # no blobs are needed:
        self.fulltext_file = tempfile.TemporaryFile()

    def __getitem__(self, rev):
        """Return the RevRecord for REV, creating it if necessary."""

        try:
            return self.revrecs[rev]
        except KeyError:
            revrec = RevRecord(rev)
            self.revrecs[rev] = revrec
            return revrec

    def define_revision(self, rev, timestamp, author, state, branches, next):
        """Record which revisions are derived from REV.

        NEXT (if not None) and every entry of BRANCHES are registered as
        depending on REV: they are added to REV's refs and REV becomes
        their base.  TIMESTAMP, AUTHOR and STATE are ignored."""

        revrec = self[rev]

        if next is not None:
            revrec.refs.add(next)

        revrec.refs.update(branches)

        for dependent_rev in revrec.refs:
            dependent_revrec = self[dependent_rev]
            # Each revision may have only one base:
            assert dependent_revrec.base is None
            dependent_revrec.base = rev

    def tree_completed(self):
        """Remove unneeded RevRecords.

        Remove the RevRecords for any revisions whose fulltext will not be
        needed (neither as blob output nor as the base of another needed
        revision)."""

        revrecs_to_remove = [
            revrec
            for revrec in self.revrecs.itervalues()
            if not revrec.is_needed()
            ]
        while revrecs_to_remove:
            revrec = revrecs_to_remove.pop()
            del self.revrecs[revrec.rev]
            if revrec.base is not None:
                # Dropping this record may leave its base without any
                # remaining dependents, in which case the base can be
                # dropped too:
                base_revrec = self[revrec.base]
                base_revrec.refs.remove(revrec.rev)
                if not base_revrec.is_needed():
                    revrecs_to_remove.append(base_revrec)

    def set_revision_info(self, rev, log, text):
        """Reconstruct the fulltext of REV and emit its blob if wanted.

        TEXT is the text stored in the RCS file for REV: the fulltext
        itself if REV has no base, otherwise a delta relative to the
        fulltext of REV's base.  LOG is ignored.  Revisions without a
        RevRecord (i.e., not needed) are skipped entirely."""

        revrec = self.revrecs.get(rev)

        if revrec is None:
            return

        base_rev = revrec.base
        if base_rev is None:
            # This must be the last revision on trunk, for which the
            # fulltext is stored directly in the RCS file:
            assert self.last_revrec is None
            if revrec.mark is not None:
                revrec.write_blob(self.blobfile, text)
            if revrec.is_needed():
                self.last_revrec = revrec
                self.last_rcsstream = RCSStream(text)
        elif self.last_revrec is not None and base_rev == self.last_revrec.rev:
            # Our base revision is stored in self.last_rcsstream.
            self.last_revrec.refs.remove(rev)
            if self.last_revrec.is_needed():
                # Other revisions still depend on the base, so save its
                # fulltext before the stream is overwritten below:
                if not self.last_revrec.is_written():
                    self.last_revrec.write(
                        self.fulltext_file, self.last_rcsstream.get_text()
                        )
            self.last_rcsstream.apply_diff(text)
            if revrec.mark is not None:
                revrec.write_blob(self.blobfile, self.last_rcsstream.get_text())
            if revrec.is_needed():
                self.last_revrec = revrec
            else:
                self.last_revrec = None
                self.last_rcsstream = None
        else:
            # Our base revision is not stored in self.last_rcsstream; it
            # will have to be obtained from elsewhere.

            # Store the old last_rcsstream if necessary:
            if self.last_revrec is not None:
                if not self.last_revrec.is_written():
                    self.last_revrec.write(
                        self.fulltext_file, self.last_rcsstream.get_text()
                        )
                self.last_revrec = None
                self.last_rcsstream = None

            base_revrec = self[base_rev]
            rcsstream = RCSStream(base_revrec.read_fulltext())
            base_revrec.refs.remove(rev)
            rcsstream.apply_diff(text)
            if revrec.mark is not None:
                revrec.write_blob(self.blobfile, rcsstream.get_text())
            if revrec.is_needed():
                self.last_revrec = revrec
                self.last_rcsstream = rcsstream
            del rcsstream

    def parse_completed(self):
        """Close the temporary fulltext file."""

        self.fulltext_file.close()
def main(args):
    """Write blobs for every (RCSFILE, MARKS) request read from stdin.

    ARGS must contain exactly one element, the name of the blob file,
    which is created (or truncated) and opened for binary update.
    Pickled tuples (RCSFILE, {CVS_REV : MARK, ...}) are then read from
    standard input until end of file; for each one RCSFILE is parsed
    with a new WriteBlobSink that appends to the blob file.

    Raise ValueError if ARGS does not contain exactly one element."""

    [blobfilename] = args
    blobfile = open(blobfilename, 'w+b')
    try:
        while True:
            try:
                # NOTE(security): unpickling can execute arbitrary code,
                # so stdin must only ever be fed by a trusted caller.
                (rcsfile, marks) = pickle.load(sys.stdin)
            except EOFError:
                break
            rcs = open(rcsfile, 'rb')
            try:
                parse(rcs, WriteBlobSink(blobfile, marks))
            finally:
                # Close each RCS file promptly rather than relying on
                # garbage collection; many files may be processed.
                rcs.close()
    finally:
        # Make sure the blob file is flushed and closed even if parsing
        # one of the RCS files fails.
        blobfile.close()
if __name__ == '__main__':
    # Run as a script: the command-line arguments (expected to be just
    # BLOBFILE) are passed on to main().
    main(sys.argv[1:])