Make minimal changes to get HTML files to be valid XHTML, dropping from Strict
[cvs2svn.git] / cvs2svn_lib / collect_data.py
bloba4935072aebf9827f77443b29fd98059b10b7604
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2006 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
"""This module contains the classes that collect data from RCS files
for cvs2svn."""
20 from __future__ import generators
22 import sys
23 import os
24 import re
25 import time
26 import sha
27 import stat
29 from boolean import *
30 import common
31 from common import warning_prefix
32 from common import error_prefix
33 import config
34 from log import Log
35 from context import Ctx
36 from artifact_manager import artifact_manager
37 import cvs_revision
38 from stats_keeper import StatsKeeper
39 import database
40 import symbol_database
41 import cvs2svn_rcsparse
# A trunk revision number: exactly two components, e.g. '1.7'.
trunk_rev = re.compile(r'^[0-9]+\.[0-9]+$')

# A CVS "magic" branch number such as '1.7.0.2'.  Group 1 is the sprout
# revision with its trailing dot ('1.7.') and group 2 is the branch
# index ('2'), so group(1) + group(2) is the RCS branch number '1.7.2'.
cvs_branch_tag = re.compile(r'^((?:[0-9]+\.[0-9]+\.)+)0\.([0-9]+)$')

# A plain RCS branch number (odd number of components), e.g. '1.7.2'.
rcs_branch_tag = re.compile(r'^(?:[0-9]+\.[0-9]+\.)+[0-9]+$')

# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# Group 1 is the vendor branch number and group 2 is the complete final
# component.  (The repetition has to be inside the group; with it
# outside, the group would only retain the last digit.)
vendor_revision = re.compile(r'^(1\.1\.1)\.([0-9]+)$')
class FileDataCollector(cvs2svn_rcsparse.Sink):
    """Class responsible for collecting RCS data for a particular file.

    An instance is the rcsparse sink for one RCS file.  Per-file
    bookkeeping is kept on the instance; any collected data that need
    to be remembered are stored into the referenced CollectData
    instance.
    """

    def __init__(self, collect_data, canonical_name, filename):
        """Create an object that is prepared to receive data for FILENAME.

        FILENAME is the absolute filesystem path to the file in
        question, and CANONICAL_NAME is FILENAME with the 'Attic'
        component removed (if the file is indeed in the Attic).
        COLLECT_DATA is used to store the information collected about
        the file.
        """
        self.collect_data = collect_data

        self.fname = canonical_name

        # We calculate and save some file metadata here, where we can do
        # it only once per file, instead of waiting until later where we
        # would have to do the same calculations once per CVS *revision*.

        self.cvs_path = Ctx().cvs_repository.get_cvs_path(self.fname)

        # If the paths are not the same, then that means that the
        # canonical_name has had the 'Attic' component stripped out.
        self.file_in_attic = (canonical_name != filename)

        file_stat = os.stat(filename)
        # The size of our file in bytes
        self.file_size = file_stat[stat.ST_SIZE]

        # Whether or not the executable bit is set.
        self.file_executable = bool(file_stat[stat.ST_MODE] & stat.S_IXUSR)

        # revision -> [timestamp, author, old-timestamp]
        #
        # old-timestamp stays None unless tree_completed() had to move
        # the revision back in time; it then holds the timestamp the
        # revision had before it was first moved.
        self.rev_data = {}

        # Maps revision number (key) to the revision number of the
        # previous revision along this line of development.
        #
        # For the first revision R on a branch, we consider the revision
        # from which R sprouted to be the 'previous'.
        #
        # Note that this revision can't be determined arithmetically
        # (due to cvsadmin -o, which is why this is necessary).
        #
        # If the key has no previous revision, then store None as key's
        # value.
        self.prev_rev = {}

        # This dict is essentially self.prev_rev with the values mapped
        # in the other direction, so following key -> value will yield
        # you the next revision number.
        #
        # Unlike self.prev_rev, if the key has no next revision, then
        # the key is not present.
        self.next_rev = {}

        # Track the state of each revision so that in
        # set_revision_info, we can determine if our op is an
        # add/change/delete.  We can do this because in
        # set_revision_info, we'll have all of the revisions for a file
        # at our fingertips, and we need to examine the state of our
        # prev_rev to determine if we're an add or a change--without the
        # state of the prev_rev, we are unable to distinguish between an
        # add and a change.
        self.rev_state = {}

        # Hash mapping branch numbers, like '1.7.2', to branch names,
        # like 'Release_1_0_dev'.
        self.branch_names = {}

        # RCS flags (used for keyword expansion).
        self.mode = None

        # Hash mapping revision numbers, like '1.7', to lists of names
        # indicating which branches sprout from that revision, like
        # ['Release_1_0_dev', 'experimental_driver', ...].
        self.branchlist = {}

        # Like self.branchlist, but the values are lists of tag names
        # that apply to the key revision.
        self.taglist = {}

        # If set, this is an RCS branch number -- rcsparse calls this
        # the "principal branch", but CVS and RCS refer to it as the
        # "default branch", so that's what we call it, even though the
        # rcsparse API setter method is still 'set_principal_branch'.
        self.default_branch = None

        # If the RCS file doesn't have a default branch anymore, but
        # does have vendor revisions, then we make an educated guess
        # that those revisions *were* the head of the default branch up
        # until the commit of 1.2, at which point the file's default
        # branch became trunk.  This records the date at which 1.2 was
        # committed.
        self.first_non_vendor_revision_date = None

        # The set of all symbols defined for the current file (only the
        # keys matter).  Used to prevent multiple definitions of a
        # symbol, something which can easily happen when
        # --symbol-transform is used.
        self.defined_symbols = {}

    def set_principal_branch(self, branch):
        """Remember BRANCH as the file's default branch number."""
        self.default_branch = branch

    def set_expansion(self, mode):
        """Remember MODE, the RCS keyword expansion flags of the file."""
        self.mode = mode

    def set_branch_name(self, branch_number, name):
        """Record that BRANCH_NUMBER is the branch number for branch NAME.

        BRANCH_NUMBER is an RCS branch number with an odd number of
        components, for example '1.7.2' (never '1.7.0.2').  NAME is
        also recorded as sprouting from the revision obtained by
        dropping the final component of BRANCH_NUMBER ('1.7' in the
        example), and the branch creation is registered with the symbol
        database.

        If BRANCH_NUMBER already has a name, write a warning to stderr
        and ignore NAME.
        """
        if branch_number in self.branch_names:
            sys.stderr.write("%s: in '%s':\n"
                             " branch '%s' already has name '%s',\n"
                             " cannot also have name '%s', ignoring the"
                             " latter\n"
                             % (warning_prefix, self.fname, branch_number,
                                self.branch_names[branch_number], name))
            return

        self.branch_names[branch_number] = name
        # The branchlist is keyed on the revision number from which the
        # branch sprouts, so strip off the odd final component.
        sprout_rev = branch_number[:branch_number.rfind(".")]
        self.branchlist.setdefault(sprout_rev, []).append(name)
        self.collect_data.symbol_db.register_branch_creation(name)

    def rev_to_branch_name(self, revision):
        """Return the name of the branch on which REVISION lies.

        REVISION is a non-branch revision number with an even number of
        components, for example '1.7.2.1' (never '1.7.2' nor
        '1.7.0.2').  For the convenience of callers, REVISION can also
        be a trunk revision such as '1.2', in which case just return
        None.  None is also returned if no name is recorded for the
        branch.
        """
        if trunk_rev.match(revision):
            return None
        return self.branch_names.get(revision[:revision.rindex(".")])

    def define_tag(self, name, revision):
        """Record the symbolic NAME that the RCS header attaches to REVISION.

        REVISION is an unprocessed revision number from the RCS file's
        header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
        This function will determine what kind of symbolic name it is
        by inspection, and record it in the right places.

        NAME is first passed through every (pattern, replacement) pair
        in Ctx().symbol_transforms.  If the resulting name was already
        defined for this file, the problem is reported on stderr and
        appended to the collector's fatal_errors; the symbol is still
        recorded afterwards.
        """
        for (pattern, replacement) in Ctx().symbol_transforms:
            newname = pattern.sub(replacement, name)
            if newname != name:
                Log().write(Log.WARN, " symbol '%s' transformed to '%s'"
                            % (name, newname))
                name = newname

        if name in self.defined_symbols:
            err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
                  % (error_prefix, name, self.fname)
            sys.stderr.write(err + "\n")
            self.collect_data.fatal_errors.append(err)

        self.defined_symbols[name] = None

        m = cvs_branch_tag.match(revision)
        if m:
            # A CVS magic branch number: drop the '0.' component to get
            # the RCS branch number.
            self.set_branch_name(m.group(1) + m.group(2), name)
        elif rcs_branch_tag.match(revision):
            self.set_branch_name(revision, name)
        else:
            self.taglist.setdefault(revision, []).append(name)
            self.collect_data.symbol_db.register_tag_creation(name)

    def define_revision(self, revision, timestamp, author, state,
                        branches, next):
        """Record the tree information for REVISION.

        TIMESTAMP, AUTHOR and STATE are stored for later use.  BRANCHES
        is the sequence of revisions that sprout from REVISION, and
        NEXT is the RCS 'next' revision (false if there is none); both
        are used to fill in self.prev_rev and self.next_rev.  The
        default-branch record and the symbol database are updated as a
        side effect.
        """
        # Record the state of our revision for later calculations
        self.rev_state[revision] = state

        # store the rev_data as a list in case we have to jigger the
        # timestamp
        self.rev_data[revision] = [int(timestamp), author, None]

        # When on trunk, the RCS 'next' revision number points to what
        # humans might consider to be the 'previous' revision number.
        # For example, 1.3's RCS 'next' is 1.2.
        #
        # However, on a branch, the RCS 'next' revision number really
        # does point to what humans would consider to be the 'next'
        # revision number.  For example, 1.1.2.1's RCS 'next' would be
        # 1.1.2.2.
        #
        # In other words, in RCS, 'next' always means "where to find the
        # next deltatext that you need this revision to retrieve".
        #
        # That said, we don't *want* RCS's behavior here, so we
        # determine whether we're on trunk or a branch and set
        # self.prev_rev accordingly.
        #
        # One last thing.  Note that if REVISION is a branch revision,
        # instead of mapping REVISION to NEXT, we instead map NEXT to
        # REVISION.  Since we loop over all revisions in the file before
        # doing anything with the data we gather here, this 'reverse
        # assignment' effectively does the following:
        #
        # 1. Gives us no 'prev' value for REVISION (in this
        #    iteration... it may have been set in a previous iteration)
        #
        # 2. Sets the 'prev' value for the revision with number NEXT to
        #    REVISION.  So when we come around to the branch revision
        #    whose revision value is NEXT, its 'prev' and 'prev_rev' are
        #    already set.
        if trunk_rev.match(revision):
            self.prev_rev[revision] = next
            if next:
                # The trunk root has no RCS 'next'; don't create a
                # bogus entry keyed on None for it.
                self.next_rev[next] = revision
        elif next:
            self.prev_rev[next] = revision
            self.next_rev[revision] = next

        for b in branches:
            self.prev_rev[b] = revision

        # Ratchet up the highest vendor head revision, if necessary.
        if self.default_branch:
            default_branch_root = self.default_branch + "."
            if (revision.startswith(default_branch_root)
                and default_branch_root.count('.') == revision.count('.')):
                # This revision is on the default branch, so record that
                # it is the new highest default branch head revision.
                self.collect_data.default_branches_db[self.cvs_path] = \
                    revision
        else:
            # No default branch, so make an educated guess.
            if revision == '1.2':
                # This is probably the time when the file stopped having
                # a default branch, so make a note of it.
                self.first_non_vendor_revision_date = timestamp
            else:
                m = vendor_revision.match(revision)
                if m and ((not self.first_non_vendor_revision_date)
                          or (timestamp
                              < self.first_non_vendor_revision_date)):
                    # We're looking at a vendor revision, and it wasn't
                    # committed after this file lost its default branch,
                    # so bump the maximum trunk vendor revision in the
                    # permanent record.
                    self.collect_data.default_branches_db[self.cvs_path] = \
                        revision

        if not trunk_rev.match(revision):
            # Check for unlabeled branches, record them.  We tried to
            # collect all branch names when we parsed the symbolic name
            # header earlier, of course, but that didn't catch unlabeled
            # branches.  If a branch is unlabeled, this is our first
            # encounter with it, so we have to record its data now.
            branch_number = revision[:revision.rindex(".")]
            if branch_number not in self.branch_names:
                branch_name = "unlabeled-" + branch_number
                self.set_branch_name(branch_number, branch_name)

            # Register the commit on this non-trunk branch
            branch_name = self.branch_names[branch_number]
            self.collect_data.symbol_db.register_branch_commit(branch_name)

    def tree_completed(self):
        """The revision tree has been parsed.  Analyze it for consistency.

        Our algorithm depends upon the timestamps on the revisions
        occurring monotonically over time.  That is, we want to see rev
        1.34 occur in time before rev 1.35.  If we inserted 1.35
        *first* (due to the time-sorting), and then tried to insert
        1.34, we'd be screwed.

        To perform the analysis, we simply visit all of the 'previous'
        links that we have recorded and validate that the timestamp on
        the previous revision is before the specified revision.  If we
        have to resync some nodes, then we restart the scan, and keep
        doing so for as long as a scan finds something to fix.
        """
        while self._resync_first_inversion():
            pass

    def _resync_first_inversion(self):
        """Fix the first out-of-order (revision, previous) pair found.

        Return True if a pair had to be fixed (the caller must then
        scan again), or False if all recorded links are in order.
        """
        for current, prev in self.prev_rev.items():
            if not prev:
                # no previous revision exists (i.e. the initial revision)
                continue
            t_c = self.rev_data[current][0]
            t_p = self.rev_data[prev][0]
            if t_p >= t_c:
                self._push_back_in_time(prev, t_c, t_p)
                return True
        return False

    def _push_back_in_time(self, prev, t_c, t_p):
        """Move PREV (and ancestors, as needed) to before time T_C.

        PREV is a revision whose timestamp T_P is not earlier than T_C,
        the timestamp of a revision that follows it.  PREV is given the
        timestamp T_C - 1, and the same is repeated along the chain of
        previous revisions for as long as they are out of order too.

        We sync backwards and not forwards because any given CVS
        Revision has only one previous revision.  However, a CVS
        Revision can *be* a previous revision for many other revisions
        (e.g., a revision that is the source of multiple branches).
        This becomes relevant when we do the secondary synchronization
        in pass 2--we can make certain that we don't resync a revision
        earlier than its previous revision, but it would be non-trivial
        to make sure that we don't resync revision R *after* any
        revisions that have R as a previous revision.
        """
        while t_p >= t_c:
            record = self.rev_data[prev]
            record[0] = t_c - 1  # new timestamp
            if record[2] is None:
                # Old timestamp.  A revision that is 'previous' to
                # several others can be moved more than once; only its
                # first timestamp is the one it was committed with, so
                # don't overwrite that with an intermediate value.
                record[2] = t_p
            delta = t_c - 1 - t_p
            msg = "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                  % (self.cvs_path, prev, time.ctime(t_p), delta)
            Log().write(Log.VERBOSE, msg)
            if abs(delta) > config.COMMIT_THRESHOLD:
                Log().write(Log.WARN,
                            "%s: Significant timestamp change for '%s' "
                            "(%d seconds)"
                            % (warning_prefix, self.cvs_path, delta))

            # Step one link back along the chain.
            current = prev
            prev = self.prev_rev[current]
            if not prev:
                break
            t_c -= 1  # self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

    def set_revision_info(self, revision, log, text):
        """Build the CVSRevision for REVISION and hand it to the collector.

        LOG is the revision's log message.  Of TEXT only the truth
        value is passed on to the CVSRevision.  The (author, log) pair
        is stored in the metadata database under a digest of the two,
        unless that digest is already present.
        """
        timestamp, author, old_ts = self.rev_data[revision]
        digest = sha.new(log + '\0' + author).hexdigest()
        if old_ts:
            # the timestamp on this revision was changed.  log it for
            # later resynchronization of other files' revisions that
            # occurred for this time and log message.
            self.collect_data.resync.write('%08lx %s %08lx\n'
                                           % (old_ts, digest, timestamp))

        # "...Give back one kadam to honor the Hebrew God whose Ark this
        # is."
        #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
        #
        # If revision 1.1 appears to have been created via 'cvs add'
        # instead of 'cvs import', then this file probably never had a
        # default branch, so retroactively remove its record in the
        # default branches db.  The test is that the log message CVS
        # uses for 1.1 in imports is "Initial revision\n" with no
        # period.
        if revision == '1.1' and log != 'Initial revision\n':
            try:
                del self.collect_data.default_branches_db[self.cvs_path]
            except KeyError:
                pass

        # Get the timestamps of the previous and next revisions (0 if
        # there is no such revision).
        no_data = [0, None, None]
        prev_rev = self.prev_rev[revision]
        prev_timestamp = self.rev_data.get(prev_rev, no_data)[0]

        next_rev = self.next_rev.get(revision)
        next_timestamp = self.rev_data.get(next_rev, no_data)[0]

        op = self._determine_operation(revision, prev_rev)

        c_rev = cvs_revision.CVSRevision(
            timestamp, digest, prev_timestamp, next_timestamp, op,
            prev_rev, revision, next_rev,
            self.file_in_attic, self.file_executable, self.file_size,
            bool(text), self.fname, self.mode,
            self.rev_to_branch_name(revision),
            self.taglist.get(revision, []),
            self.branchlist.get(revision, []))
        self.collect_data.add_cvs_revision(c_rev)

        # metadata_db is a database.Database, not a plain dict; stay
        # with has_key() because that is the only membership interface
        # this file shows it to support.
        if not self.collect_data.metadata_db.has_key(digest):
            self.collect_data.metadata_db[digest] = (author, log)

    def _determine_operation(self, revision, prev_rev):
        """Return the OP_* constant describing what REVISION does.

        PREV_REV is the previous revision of REVISION, or None.

        How to tell if a CVSRevision is an add, a change, or a
        deletion:

        It's a delete if RCS state is 'dead'

        It's an add if RCS state is not 'dead' and
          - we either have no previous revision
            or
          - we have a previous revision whose state is 'dead'

        Anything else is a change.
        """
        if self.rev_state[revision] == 'dead':
            return common.OP_DELETE

        if prev_rev is None or self.rev_state[prev_rev] == 'dead':
            op = common.OP_ADD
        else:
            op = common.OP_CHANGE

        if not self._is_branch_revision(revision):
            return op

        # There can be an odd situation where the tip revision of a
        # branch is alive, but every predecessor on the branch is in
        # state 'dead', yet the revision from which the branch sprouts
        # is alive.  (This is sort of a mirror image of the more common
        # case of adding a file on a branch, in which the first revision
        # on the branch is alive while the revision from which it
        # sprouts is dead.)
        #
        # In this odd situation, we must mark the first live revision on
        # the branch as an OP_CHANGE instead of an OP_ADD, because it
        # reflects, however indirectly, a change w.r.t. the source
        # revision from which the branch sprouts.
        #
        # This is issue #89.
        #
        # Walk the chain of previous revisions looking for a step from a
        # dead revision to a live revision on another line of
        # development.
        cur_num = revision
        while 1:
            prev_num = self.prev_rev.get(cur_num)
            if not prev_num:
                return op
            if (not self._is_same_line_of_development(cur_num, prev_num)
                and self.rev_state[cur_num] == 'dead'
                and self.rev_state[prev_num] != 'dead'):
                return common.OP_CHANGE
            cur_num = prev_num

    def _is_branch_revision(self, rev):
        """Return True if REV is not a trunk revision, else return False."""
        return rev.count('.') >= 3

    def _is_same_line_of_development(self, rev1, rev2):
        """Return True if REV1 and REV2 are on the same line of development.

        That is, both on trunk, or both on the same branch; return
        False otherwise.  Either REV1 or REV2 can be None, in which
        case automatically return False.
        """
        if rev1 is None or rev2 is None:
            return False
        if rev1.count('.') == 1 and rev2.count('.') == 1:
            return True
        return rev1[:rev1.rfind('.')] == rev2[:rev2.rfind('.')]

    def parse_completed(self):
        """Finish the file: register branch blockers and count the file."""
        # Walk through all branches and tags and register them with
        # their parent branch in the symbol database.
        for symbol_map in (self.taglist, self.branchlist):
            for revision, symbols in symbol_map.items():
                name = self.rev_to_branch_name(revision)
                if name is None:
                    # The symbols sit on trunk (or on a branch without
                    # a recorded name); nothing to register.
                    continue
                for symbol in symbols:
                    self.collect_data.symbol_db.register_branch_blocker(
                        name, symbol)

        self.collect_data.num_files += 1
class CollectData:
    """Repository for data collected by parsing the CVS repository files.

    The databases and data files that receive the information gathered
    from the CVS repository are owned by this class.  FileDataCollector
    instances, one of which is created for each file to be parsed,
    store their results here.
    """

    def __init__(self):
        """Open the temporary data files and create the databases."""
        temp_file = artifact_manager.get_temp_file

        # Data files written during collection.
        self._revs = open(temp_file(config.REVS_DATAFILE), 'w')
        self.resync = open(temp_file(config.RESYNC_DATAFILE), 'w')

        # Databases, always created afresh.
        self.default_branches_db = database.SDatabase(
            temp_file(config.DEFAULT_BRANCHES_DB), database.DB_OPEN_NEW)
        self.metadata_db = database.Database(
            temp_file(config.METADATA_DB), database.DB_OPEN_NEW)

        # Messages of the fatal errors seen so far.
        self.fatal_errors = []

        # Incremented by FileDataCollector.parse_completed().
        self.num_files = 0

        self.symbol_db = symbol_database.SymbolDatabase()

        # 1 if we've collected data for at least one file, None otherwise.
        self.found_valid_file = None

    def add_cvs_revision(self, c_rev):
        """Write C_REV to the revisions data file and record it in the
        statistics."""
        line = c_rev.__getstate__() + '\n'
        self._revs.write(line)
        StatsKeeper().record_c_rev(c_rev)

    def write_symbol_db(self):
        """Ask the symbol database to write itself out."""
        self.symbol_db.write()