A program for generating blobs externally to main cvs2git program.
[cvs2svn.git] / cvs2svn_lib / generate_blobs.py
blobd2051acc745eba5079412034263ef8e085fdae6c
1 #!/usr/bin/env python -u
2 # (Be in -*- python -*- mode.)
4 # ====================================================================
5 # Copyright (c) 2009-2010 CollabNet. All rights reserved.
7 # This software is licensed as described in the file COPYING, which
8 # you should have received as part of this distribution. The terms
9 # are also available at http://subversion.tigris.org/license-1.html.
10 # If newer versions of this license are posted there, you may use a
11 # newer version instead, at your option.
13 # This software consists of voluntary contributions made by many
14 # individuals. For exact contribution history, see the revision
15 # history and logs, available at http://cvs2svn.tigris.org/.
16 # ====================================================================
18 """Generate git blobs directly from RCS files.
20 Usage: generate_blobs.py BLOBFILE
22 To standard input should be written a series of pickles, each of which
23 contains the following tuple:
25 (RCSFILE, {CVS_REV : MARK, ...})
27 indicating which RCS file to read, which CVS revisions should be
28 written to the blob file, and which marks to give each of the blobs.
30 Since the tuples are read from stdin, either the calling program has
31 to write to this program's stdin in binary mode and ensure that this
32 program's standard input is opened in binary mode (e.g., using
33 Python's '-u' option) or both can be in text mode *provided* that
34 pickle protocol 0 is used.
36 The program does most of its work in RAM, keeping at most one revision
37 fulltext and one revision deltatext (plus perhaps one or two copies as
38 scratch space) in memory at a time. But there are times when the
39 fulltext of a revision is needed multiple times, for example when
40 multiple branches sprout from the revision. In these cases, the
41 fulltext is written to disk. If the fulltext is also needed for the
42 blobfile, then the copy in the blobfile is read again when it is
43 needed. If the fulltext is not needed in the blobfile, then it is
44 written to a temporary file created with Python's tempfile module."""
46 import sys
47 import os
48 import tempfile
49 import cPickle as pickle
51 sys.path.insert(0, os.path.dirname(os.path.dirname(sys.argv[0])))
53 from cvs2svn_rcsparse import Sink
54 from cvs2svn_rcsparse import parse
55 from cvs2svn_lib.rcs_stream import RCSStream
def read_marks():
    """Read a revision-to-mark table from standard input.

    Every line of stdin must consist of exactly two
    whitespace-separated fields: a CVS revision number (e.g., 1.2.3.4)
    followed by its mark.  A line with any other number of fields
    raises ValueError.

    Return a dict mapping each revision number to its mark (both are
    kept as strings)."""

    rev_to_mark = {}
    for line in sys.stdin:
        # split() without arguments already ignores leading and trailing
        # whitespace, including the line terminator.
        rev, mark = line.split()
        rev_to_mark[rev] = mark

    return rev_to_mark
class RevRecord(object):
    """Bookkeeping for a single CVS revision.

    Attributes:

      rev -- the CVS revision number of this record.

      mark -- the mark under which this revision's blob is to be
          written, or None if no blob is (or is any longer) pending.

      base -- the revision whose fulltext is the base for this
          revision's delta, or None if that has not been set.

      refs -- the set of revisions that use this one as their base.

      fulltext -- None until the fulltext has been stored; afterwards
          a tuple (f, offset, length) telling where it can be read."""

    def __init__(self, rev, mark=None):
        self.rev = rev
        self.mark = mark
        self.base = None
        self.refs = set()
        self.fulltext = None

    def is_needed(self):
        """Return True iff a blob is still pending for this revision or
        at least one other revision still refers to it as its base."""

        if self.mark is not None:
            return True
        return bool(self.refs)

    def is_written(self):
        """Return True iff the fulltext has been stored in some file."""

        return not (self.fulltext is None)

    def write_blob(self, f, text):
        """Append TEXT to file F as a blob carrying this record's mark.

        The location of the text within F is remembered so that
        read_fulltext() can retrieve it later."""

        # Whence 2 means "relative to the end of the file": always append.
        f.seek(0, 2)
        size = len(text)
        header_lines = (
            'blob\n',
            'mark :%s\n' % (self.mark,),
            'data %d\n' % (size,),
        )
        for header_line in header_lines:
            f.write(header_line)
        # The payload starts right after the header.
        start = f.tell()
        f.write(text)
        f.write('\n')

        self.fulltext = (f, start, size)

        # The blob has been emitted, so the mark has served its purpose.
        # Clearing it may let is_needed() become False.
        self.mark = None

    def write(self, f, text):
        """Append the bare TEXT to file F and remember where it is."""

        # Whence 2 means "relative to the end of the file": always append.
        f.seek(0, 2)
        start = f.tell()
        size = len(text)
        f.write(text)
        self.fulltext = (f, start, size)

    def read_fulltext(self):
        """Read back and return the previously stored fulltext.

        The fulltext must already have been stored via write_blob() or
        write()."""

        assert self.fulltext is not None
        (source, start, size) = self.fulltext
        source.seek(start)
        return source.read(size)

    def __str__(self):
        written = self.fulltext is not None
        if self.mark is None:
            return '%s: %r, %s' % (self.rev, self.refs, written)
        return '%s (%r): %r, %s' % (
            self.rev, self.mark, self.refs, written,
        )
class WriteBlobSink(Sink):
    """A parser sink that writes the requested revisions of one RCS file
    to a blob file.

    The sink keeps a RevRecord for every revision whose fulltext is
    still needed, either because a blob has to be written for it or
    because another needed revision is expressed as a delta against it.
    Fulltexts that have to be used more than once are parked either in
    the blob file itself or in a private temporary file."""

    def __init__(self, blobfile, marks):
        """Initialize the sink.

        BLOBFILE is the file object to which blobs are appended; it is
        also read back via RevRecord.read_fulltext(), so it must support
        seek(), tell(), read() and write().  MARKS is a map
        {rev : mark} naming the revisions for which blobs are wanted."""

        self.blobfile = blobfile

        # A map {rev : RevRecord} for all of the revisions whose fulltext
        # will still be needed:
        self.revrecs = {}

        # The revisions that need marks will definitely be needed, so
        # create records for them now (the rest will be filled in while
        # reading the RCS file):
        for (rev, mark) in marks.items():
            self.revrecs[rev] = RevRecord(rev, mark)

        # The RevRecord of the last fulltext that has been reconstructed,
        # if it still is_needed():
        self.last_revrec = None
        # An RCSStream holding the fulltext of last_revrec:
        self.last_rcsstream = None

        # A file to temporarily hold the fulltexts of revisions for which
        # no blobs are needed:
        self.fulltext_file = tempfile.TemporaryFile()

    def __getitem__(self, rev):
        """Return the RevRecord for REV, creating and registering a new
        (markless) one if none exists yet."""

        try:
            return self.revrecs[rev]
        except KeyError:
            revrec = RevRecord(rev)
            self.revrecs[rev] = revrec
            return revrec

    def define_revision(self, rev, timestamp, author, state, branches, next):
        """Record the dependencies of revision REV.

        NEXT (if not None) and every entry of BRANCHES are registered as
        revisions whose deltas are based on REV.  TIMESTAMP, AUTHOR and
        STATE are not used.  The parameter names (including 'next',
        which shadows the builtin) are kept as they are because they
        form part of the sink's calling interface."""

        revrec = self[rev]

        if next is not None:
            revrec.refs.add(next)

        revrec.refs.update(branches)

        for dependent_rev in revrec.refs:
            dependent_revrec = self[dependent_rev]
            # Each revision may have only one base.
            assert dependent_revrec.base is None
            dependent_revrec.base = rev

    def tree_completed(self):
        """Remove unneeded RevRecords.

        Remove the RevRecords for any revisions whose fulltext will not be
        needed (neither as blob output nor as the base of another needed
        revision)."""

        # Build a real list up front, because self.revrecs is modified
        # while the list is being worked through.
        revrecs_to_remove = [
            revrec
            for revrec in self.revrecs.itervalues()
            if not revrec.is_needed()
        ]
        while revrecs_to_remove:
            revrec = revrecs_to_remove.pop()
            del self.revrecs[revrec.rev]
            if revrec.base is None:
                # This record has no base (e.g., it is the revision whose
                # fulltext is stored directly in the RCS file), so there
                # is no base record to update.  Looking up self[None]
                # would register a bogus record and then fail with a
                # KeyError when removing the reference from its empty
                # refs set.
                continue
            base_revrec = self[revrec.base]
            base_revrec.refs.remove(revrec.rev)
            # Dropping the reference may have made the base unneeded too;
            # if so, cascade the removal.
            if not base_revrec.is_needed():
                revrecs_to_remove.append(base_revrec)

    def set_revision_info(self, rev, log, text):
        """Process the text stored in the RCS file for revision REV.

        TEXT is the fulltext if REV has no base, and otherwise the delta
        to be applied to the fulltext of REV's base.  LOG is not used.
        If a blob was requested for REV it is written to the blob file.
        Revisions without a RevRecord (i.e., unneeded ones) are
        skipped."""

        revrec = self.revrecs.get(rev)

        if revrec is None:
            return

        base_rev = revrec.base
        if base_rev is None:
            # This must be the last revision on trunk, for which the
            # fulltext is stored directly in the RCS file:
            assert self.last_revrec is None
            if revrec.mark is not None:
                revrec.write_blob(self.blobfile, text)
            if revrec.is_needed():
                self.last_revrec = revrec
                self.last_rcsstream = RCSStream(text)
        elif self.last_revrec is not None and base_rev == self.last_revrec.rev:
            # Our base revision is stored in self.last_rcsstream.
            self.last_revrec.refs.remove(rev)
            if self.last_revrec.is_needed():
                # Other revisions still depend on the base, so its fulltext
                # has to be saved before the stream is transformed in place.
                if not self.last_revrec.is_written():
                    self.last_revrec.write(
                        self.fulltext_file, self.last_rcsstream.get_text()
                    )
            self.last_rcsstream.apply_diff(text)
            if revrec.mark is not None:
                revrec.write_blob(self.blobfile, self.last_rcsstream.get_text())
            if revrec.is_needed():
                self.last_revrec = revrec
            else:
                self.last_revrec = None
                self.last_rcsstream = None
        else:
            # Our base revision is not stored in self.last_rcsstream; it
            # will have to be obtained from elsewhere.

            # Store the old last_rcsstream if necessary:
            if self.last_revrec is not None:
                if not self.last_revrec.is_written():
                    self.last_revrec.write(
                        self.fulltext_file, self.last_rcsstream.get_text()
                    )
                self.last_revrec = None
                self.last_rcsstream = None

            base_revrec = self[base_rev]
            rcsstream = RCSStream(base_revrec.read_fulltext())
            base_revrec.refs.remove(rev)
            rcsstream.apply_diff(text)
            if revrec.mark is not None:
                revrec.write_blob(self.blobfile, rcsstream.get_text())
            if revrec.is_needed():
                self.last_revrec = revrec
                self.last_rcsstream = rcsstream
            # Drop the local reference so that the stream can be freed as
            # soon as it is no longer held as self.last_rcsstream.
            del rcsstream

    def parse_completed(self):
        """Release the temporary fulltext file."""

        self.fulltext_file.close()
def main(args):
    """Write the blobs requested on standard input to a blob file.

    ARGS must be a one-element sequence holding the name of the blob
    file, which is created (or truncated).  Pickled tuples

        (RCSFILE, {CVS_REV : MARK, ...})

    are then read from stdin until end of file; for each of them
    RCSFILE is parsed and the requested revisions are appended to the
    blob file.

    Both the blob file and each RCS file are closed even if parsing
    fails part-way through."""

    [blobfilename] = args
    # The blob file is opened for reading as well as writing because
    # fulltexts that have already been written are read back from it.
    blobfile = open(blobfilename, 'w+b')
    try:
        while True:
            try:
                # NOTE: unpickling executes whatever the pickle describes,
                # so stdin must only ever be fed by a trusted parent
                # process.
                (rcsfile, marks) = pickle.load(sys.stdin)
            except EOFError:
                break
            rcsfile_handle = open(rcsfile, 'rb')
            try:
                parse(rcsfile_handle, WriteBlobSink(blobfile, marks))
            finally:
                rcsfile_handle.close()
    finally:
        blobfile.close()
# Run as a script: hand everything after the program name to main().
if __name__ == '__main__':
    main(sys.argv[1:])