Merged the following revisions from branches/graph-based via svnmerge:
[cvs2svn.git] / cvs2svn_lib / collect_data.py
blobbcaa965a0a4d7daca79170067119327f7c30dd4b
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2006 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains database facilities used by cvs2svn."""
20 from __future__ import generators
22 import sys
23 import os
24 import re
25 import time
26 import sha
27 import stat
29 from boolean import *
30 import common
31 from common import warning_prefix
32 from common import error_prefix
33 import config
34 from log import Log
35 from context import Ctx
36 from artifact_manager import artifact_manager
37 from cvs_file import CVSFile
38 import cvs_revision
39 from stats_keeper import StatsKeeper
40 from key_generator import KeyGenerator
41 import database
42 from cvs_file_database import CVSFileDatabase
43 from cvs_revision_database import CVSRevisionDatabase
44 import symbol_database
45 import cvs2svn_rcsparse
# Suffix of a directory path that denotes a CVS 'Attic' directory.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'

# A trunk revision number: exactly two numeric components, e.g. '1.7'.
trunk_rev = re.compile(r'^[0-9]+\.[0-9]+$')

# A CVS branch tag such as '1.7.0.2'.  Group 1 is the leading part
# including its trailing dot ('1.7.'); group 2 is the final component
# ('2').  Concatenating both groups gives the branch number '1.7.2'.
cvs_branch_tag = re.compile(r'^((?:[0-9]+\.[0-9]+\.)+)0\.([0-9]+)$')

# A plain RCS branch number with an odd number of components, such as
# '1.7.2'.
rcs_branch_tag = re.compile(r'^(?:[0-9]+\.[0-9]+\.)+[0-9]+$')

# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# Group 2 is the complete final component.  The repetition is placed
# inside the group; with '([0-9])+' the group would only retain the
# last digit of a multi-digit component.
vendor_revision = re.compile(r'^(1\.1\.1)\.([0-9]+)$')
64 class _RevisionData:
65 """We track the state of each revision so that in set_revision_info,
66 we can determine if our op is an add/change/delete. We can do this
67 because in set_revision_info, we'll have all of the _RevisionData
68 for a file at our fingertips, and we need to examine the state of
69 our prev_rev to determine if we're an add or a change. Without the
70 state of the prev_rev, we are unable to distinguish between an add
71 and a change."""
73 def __init__(self, timestamp, author, state):
74 self.timestamp = timestamp
75 self.author = author
76 self.original_timestamp = timestamp
77 self._adjusted = False
78 self.state = state
80 def adjust_timestamp(self, timestamp):
81 self._adjusted = True
82 self.timestamp = timestamp
84 def timestamp_was_adjusted(self):
85 return self._adjusted
88 class FileDataCollector(cvs2svn_rcsparse.Sink):
89 """Class responsible for collecting RCS data for a particular file.
91 Any collected data that need to be remembered are stored into the
92 referenced CollectData instance."""
def __init__(self, collect_data, filename):
    """Create an object that is prepared to receive data for FILENAME.

    FILENAME is the absolute filesystem path to the file in question.
    COLLECT_DATA is used to store the information collected about the
    file."""

    self.collect_data = collect_data

    (dirname, basename,) = os.path.split(filename)
    if dirname.endswith(OS_SEP_PLUS_ATTIC):
        # Drop the 'Attic' portion from the filename for the canonical
        # name:
        canonical_filename = os.path.join(
            dirname[:-len(OS_SEP_PLUS_ATTIC)], basename)
        file_in_attic = True
    else:
        canonical_filename = filename
        file_in_attic = False

    # We calculate and save some file metadata here, where we can do
    # it only once per file, instead of waiting until later where we
    # would have to do the same calculations once per CVS *revision*.

    cvs_path = Ctx().cvs_repository.get_cvs_path(canonical_filename)

    file_stat = os.stat(filename)
    # The size of our file in bytes
    file_size = file_stat[stat.ST_SIZE]

    # Whether or not the executable bit is set (for the owner).
    file_executable = bool(file_stat[stat.ST_MODE] & stat.S_IXUSR)

    # mode is not known yet, so we temporarily set it to None; it is
    # filled in by set_expansion().  The leading None is presumably the
    # file id, which add_cvs_file() assigns later -- TODO confirm
    # against CVSFile's signature.
    self.cvs_file = CVSFile(
        None, filename, canonical_filename, cvs_path,
        file_in_attic, file_executable, file_size, None
        )

    # A map { revision -> c_rev } of the CVSRevision instances for all
    # revisions related to this file.  Note that items in this map
    # might be pre-filled as CVSRevisionIDs for revisions referred to
    # by earlier revisions but not yet processed.  As the revisions
    # are defined, the values are changed into CVSRevision instances.
    self._c_revs = {}

    # { revision : _RevisionData instance }
    self._rev_data = {}

    # Maps revision number (key) to the revision number of the
    # previous revision along this line of development.
    #
    # For the first revision R on a branch, we consider the revision
    # from which R sprouted to be the 'previous'.
    #
    # Note that this revision can't be determined arithmetically (due
    # to cvsadmin -o, which is why this is necessary).
    #
    # If the key has no previous revision, then store None as key's
    # value.
    self.prev_rev = {}

    # This dict is essentially self.prev_rev with the values mapped in
    # the other direction, so following key -> value will yield you
    # the next revision number.
    #
    # Unlike self.prev_rev, if the key has no next revision, then the
    # key is not present.
    self.next_rev = {}

    # Hash mapping branch numbers, like '1.7.2', to branch names,
    # like 'Release_1_0_dev'.
    self.branch_names = {}

    # Hash mapping revision numbers, like '1.7', to lists of names
    # indicating which branches sprout from that revision, like
    # ['Release_1_0_dev', 'experimental_driver', ...].
    self.branchlist = {}

    # Like self.branchlist, but the values are lists of tag names that
    # apply to the key revision.
    self.taglist = {}

    # If set, this is an RCS branch number -- rcsparse calls this the
    # "principal branch", but CVS and RCS refer to it as the "default
    # branch", so that's what we call it, even though the rcsparse API
    # setter method is still 'set_principal_branch'.
    self.default_branch = None

    # If the RCS file doesn't have a default branch anymore, but does
    # have vendor revisions, then we make an educated guess that those
    # revisions *were* the head of the default branch up until the
    # commit of 1.2, at which point the file's default branch became
    # trunk.  This records the date at which 1.2 was committed.
    self.first_non_vendor_revision_date = None

    # The symbols defined so far for the current file, stored as the
    # keys of a dict (the values are unused).  Used to detect multiple
    # definitions of a symbol, something which can easily happen when
    # --symbol-transform is used.
    self.defined_symbols = {}
193 def _get_rev_id(self, revision):
194 if revision is None:
195 return None
196 id = self._c_revs.get(revision)
197 if id is None:
198 id = cvs_revision.CVSRevisionID(
199 self.collect_data.key_generator.gen_id(), self.cvs_file, revision)
200 self._c_revs[revision] = id
201 return id.id
def set_principal_branch(self, branch):
    """Remember BRANCH as this file's default branch.

    This is a callback method declared in Sink."""

    # rcsparse says "principal branch"; this class calls the same
    # thing the default branch.
    self.default_branch = branch
def set_expansion(self, mode):
    """Store MODE as the mode of the CVSFile being collected.

    This is a callback method declared in Sink."""

    # The CVSFile was created with mode None; fill it in now.
    target = self.cvs_file
    target.mode = mode
def set_branch_name(self, branch_number, name):
    """Record that BRANCH_NUMBER is the branch number for branch NAME,
    and derive and record the revision from which NAME sprouts.

    BRANCH_NUMBER is an RCS branch number with an odd number of
    components, for example '1.7.2' (never '1.7.0.2').

    If BRANCH_NUMBER already has a name, a warning is written to
    stderr and NAME is ignored."""

    # 'in' rather than dict.has_key(): same result on a plain dict,
    # and has_key() no longer exists in Python 3.
    if branch_number in self.branch_names:
        sys.stderr.write("%s: in '%s':\n"
                         " branch '%s' already has name '%s',\n"
                         " cannot also have name '%s', ignoring the latter\n"
                         % (warning_prefix,
                            self.cvs_file.filename, branch_number,
                            self.branch_names[branch_number], name))
        return

    self.branch_names[branch_number] = name
    # The branchlist is keyed on the revision number from which the
    # branch sprouts, so strip off the odd final component.
    sprout_rev = branch_number[:branch_number.rfind(".")]
    self.branchlist.setdefault(sprout_rev, []).append(name)
    self.collect_data.symbol_db.register_branch_creation(name)
def set_tag_name(self, revision, name):
    """Attach tag NAME to REVISION and register the tag's creation
    with the symbol database."""

    tags_here = self.taglist.setdefault(revision, [])
    tags_here.append(name)
    self.collect_data.symbol_db.register_tag_creation(name)
def rev_to_branch_name(self, revision):
    """Return the name of the branch on which REVISION lies.

    REVISION is a non-branch revision number with an even number of
    components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    As a convenience to callers, REVISION may also be a trunk revision
    such as '1.2'; None is returned for those.  None is also returned
    when no name is recorded for the revision's branch number."""

    if not trunk_rev.match(revision):
        # Dropping the last component yields the branch number.
        branch_number = revision[:revision.rindex(".")]
        return self.branch_names.get(branch_number)
    return None
def define_tag(self, name, revision):
    """Record a bidirectional mapping between symbolic NAME and REVISION.

    REVISION is an unprocessed revision number from the RCS file's
    header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
    This function will determine what kind of symbolic name it is by
    inspection, and record it in the right places.

    NAME is first passed through every (pattern, replacement) pair in
    Ctx().symbol_transforms.  A name that is defined more than once
    for this file is reported on stderr and appended to the fatal
    errors of the CollectData instance; processing of the symbol then
    continues.

    This is a callback method declared in Sink."""

    for (pattern, replacement) in Ctx().symbol_transforms:
        newname = pattern.sub(replacement, name)
        if newname != name:
            Log().warn(" symbol '%s' transformed to '%s'" % (name, newname))
            name = newname

    # 'in' rather than dict.has_key(): same result on a plain dict,
    # and has_key() no longer exists in Python 3.
    if name in self.defined_symbols:
        err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
              % (error_prefix, name, self.cvs_file.filename)
        sys.stderr.write(err + "\n")
        self.collect_data.fatal_errors.append(err)

    # Only the key matters; the dict is used as a set.
    self.defined_symbols[name] = None

    m = cvs_branch_tag.match(revision)
    if m:
        # '1.7.0.2' style: dropping the '0.' gives branch number '1.7.2'.
        self.set_branch_name(m.group(1) + m.group(2), name)
    elif rcs_branch_tag.match(revision):
        self.set_branch_name(revision, name)
    else:
        self.set_tag_name(revision, name)
def admin_completed(self):
    """Hand the collected CVSFile over to the CollectData instance.

    This is a callback method declared in Sink."""

    collector = self.collect_data
    collector.add_cvs_file(self.cvs_file)
def define_revision(self, revision, timestamp, author, state,
                    branches, next):
    """Record the metadata and the ancestry links of REVISION.

    TIMESTAMP, AUTHOR and STATE are stored in a _RevisionData record.
    NEXT is the RCS 'next' revision of REVISION (false if there is
    none) and BRANCHES is a sequence of the revisions that sprout from
    REVISION; both are used to fill self.prev_rev and self.next_rev.
    The method also maintains the default-branch record of the file
    and registers branch commits, including those on branches that
    have no symbolic name.

    This is a callback method declared in Sink."""

    # Keep the data in a mutable record in case the timestamp has to be
    # adjusted later (see _resync_chain).
    self._rev_data[revision] = _RevisionData(int(timestamp), author, state)

    # When on trunk, the RCS 'next' revision number points to what
    # humans might consider to be the 'previous' revision number.  For
    # example, 1.3's RCS 'next' is 1.2.
    #
    # However, on a branch, the RCS 'next' revision number really does
    # point to what humans would consider to be the 'next' revision
    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
    #
    # In other words, in RCS, 'next' always means "where to find the
    # next deltatext that you need this revision to retrieve".
    #
    # That said, we don't *want* RCS's behavior here, so we determine
    # whether we're on trunk or a branch and set self.prev_rev
    # accordingly.
    #
    # One last thing.  Note that if REVISION is a branch revision,
    # instead of mapping REVISION to NEXT, we instead map NEXT to
    # REVISION.  Since we loop over all revisions in the file before
    # doing anything with the data we gather here, this 'reverse
    # assignment' effectively does the following:
    #
    # 1. Gives us no 'prev' value for REVISION (in this
    #    iteration... it may have been set in a previous iteration)
    #
    # 2. Sets the 'prev' value for the revision with number NEXT to
    #    REVISION.  So when we come around to the branch revision whose
    #    revision value is NEXT, its 'prev' and 'prev_rev' are already
    #    set.
    if trunk_rev.match(revision):
        # prev_rev always gets an entry for a trunk revision, with a
        # false value when there is no predecessor.
        self.prev_rev[revision] = next
        # next_rev only holds revisions that really have a successor,
        # so do not create an entry keyed on a missing NEXT.
        if next:
            self.next_rev[next] = revision
    elif next:
        self.prev_rev[next] = revision
        self.next_rev[revision] = next

    # Every branch sprouting here has REVISION as its predecessor.
    for b in branches:
        self.prev_rev[b] = revision

    # Ratchet up the highest vendor head revision, if necessary.
    if self.default_branch:
        default_branch_root = self.default_branch + "."
        if (revision.startswith(default_branch_root)
            and default_branch_root.count('.') == revision.count('.')):
            # This revision is on the default branch, so record that it
            # is the new highest default branch head revision.
            self.collect_data.default_branches_db[self.cvs_file.cvs_path] = \
                revision
    else:
        # No default branch, so make an educated guess.
        if revision == '1.2':
            # This is probably the time when the file stopped having a
            # default branch, so make a note of it.
            self.first_non_vendor_revision_date = timestamp
        else:
            m = vendor_revision.match(revision)
            if m and ((not self.first_non_vendor_revision_date)
                      or (timestamp < self.first_non_vendor_revision_date)):
                # We're looking at a vendor revision, and it wasn't
                # committed after this file lost its default branch, so
                # bump the maximum trunk vendor revision in the
                # permanent record.
                self.collect_data.default_branches_db[
                    self.cvs_file.cvs_path] = revision

    if not trunk_rev.match(revision):
        # Check for unlabeled branches, record them.  We tried to
        # collect all branch names when we parsed the symbolic name
        # header earlier, of course, but that didn't catch unlabeled
        # branches.  If a branch is unlabeled, this is our first
        # encounter with it, so we have to record its data now.
        branch_number = revision[:revision.rindex(".")]
        # 'in' rather than dict.has_key(): same result on a plain dict.
        if branch_number not in self.branch_names:
            branch_name = "unlabeled-" + branch_number
            self.set_branch_name(branch_number, branch_name)

        # Register the commit on this non-trunk branch
        branch_name = self.branch_names[branch_number]
        self.collect_data.symbol_db.register_branch_commit(branch_name)
373 def _resync_chain(self, current, prev):
374 """If the PREV revision exists and it occurred later than the
375 CURRENT revision, then shove the previous revision back in time
376 (and any before it that may need to shift). Return True iff any
377 resyncing was done.
379 We sync backwards and not forwards because any given CVS Revision
380 has only one previous revision. However, a CVS Revision can *be*
381 a previous revision for many other revisions (e.g., a revision
382 that is the source of multiple branches). This becomes relevant
383 when we do the secondary synchronization in pass 2--we can make
384 certain that we don't resync a revision earlier than its previous
385 revision, but it would be non-trivial to make sure that we don't
386 resync revision R *after* any revisions that have R as a previous
387 revision."""
389 resynced = False
390 while prev is not None:
391 current_rev_data = self._rev_data[current]
392 prev_rev_data = self._rev_data[prev]
394 if prev_rev_data.timestamp < current_rev_data.timestamp:
395 # No resyncing needed here.
396 return resynced
398 old_timestamp = prev_rev_data.timestamp
399 prev_rev_data.adjust_timestamp(current_rev_data.timestamp - 1)
400 resynced = True
401 delta = prev_rev_data.timestamp - old_timestamp
402 Log().verbose(
403 "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds"
404 % (self.cvs_file.cvs_path, prev,
405 time.ctime(old_timestamp), delta))
406 if abs(delta) > config.COMMIT_THRESHOLD:
407 Log().warn(
408 "%s: Significant timestamp change for '%s' (%d seconds)"
409 % (warning_prefix, self.cvs_file.cvs_path, delta))
410 current = prev
411 prev = self.prev_rev[current]
413 return resynced
def tree_completed(self):
    """The revision tree has been parsed.  Analyze it for consistency.

    This is a callback method declared in Sink."""

    # Our algorithm depends upon the timestamps on the revisions
    # occurring monotonically over time.  That is, we want to see rev
    # 1.34 occur in time before rev 1.35.  If we inserted 1.35 *first*
    # (due to the time-sorting), and then tried to insert 1.34, we'd
    # be in trouble.
    #
    # To perform the analysis, we simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on
    # the previous revision is before the specified revision.
    #
    # Whenever a link had to be resynced, the scan is abandoned and
    # started afresh; we are done after the first scan that changed
    # nothing.
    rescan = True
    while rescan:
        rescan = False
        for current, prev in self.prev_rev.items():
            if self._resync_chain(current, prev):
                rescan = True
                break
def set_revision_info(self, revision, log, text):
    """Build the CVSRevision for REVISION and hand it to CollectData.

    LOG is the log message of the revision.  Of TEXT only the
    truthiness is recorded.  The author and log message are stored in
    the metadata database under a digest of both, unless that digest
    is already present.

    This is a callback method declared in Sink."""

    info = self._rev_data[revision]
    digest = sha.new(log + '\0' + info.author).hexdigest()

    if info.timestamp_was_adjusted():
        # The timestamp on this revision was changed.  Log it for
        # later resynchronization of other files' revisions that
        # occurred for this time and log message.
        resync_line = '%08lx %s %08lx\n' % (
            info.original_timestamp, digest, info.timestamp)
        self.collect_data.resync.write(resync_line)

    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
        try:
            del self.collect_data.default_branches_db[self.cvs_file.cvs_path]
        except KeyError:
            # There was no record to remove.
            pass

    # Timestamps of the neighbouring revisions; 0 stands for "no such
    # revision".
    prev_rev = self.prev_rev[revision]
    prev_rev_data = self._rev_data.get(prev_rev)
    prev_timestamp = 0
    if prev_rev_data is not None:
        prev_timestamp = prev_rev_data.timestamp

    next_rev = self.next_rev.get(revision)
    next_rev_data = self._rev_data.get(next_rev)
    next_timestamp = 0
    if next_rev_data is not None:
        next_timestamp = next_rev_data.timestamp

    # How to tell if a CVSRevision is an add, a change, or a deletion:
    #
    # It's a delete if RCS state is 'dead'.
    #
    # It's an add if it is not dead and
    # - we either have no previous revision
    # or
    # - we have a previous revision whose state is 'dead'.
    #
    # Anything else is a change.
    op = common.OP_CHANGE
    if info.state == 'dead':
        op = common.OP_DELETE
    elif prev_rev_data is None or prev_rev_data.state == 'dead':
        op = common.OP_ADD

    def on_same_line(rev1, rev2):
        """Return True iff rev1 and rev2 are both on trunk or both on
        the same branch.  If either is None, return False."""

        if rev1 is None or rev2 is None:
            return False
        both_on_trunk = rev1.count('.') == 1 and rev2.count('.') == 1
        return both_on_trunk or (
            rev1[:rev1.rfind('.')] == rev2[:rev2.rfind('.')])

    # There can be an odd situation where the tip revision of a branch
    # is alive, but every predecessor on the branch is in state 'dead',
    # yet the revision from which the branch sprouts is alive.  (This
    # is sort of a mirror image of the more common case of adding a
    # file on a branch, in which the first revision on the branch is
    # alive while the revision from which it sprouts is dead.)
    #
    # In this odd situation, we must mark the first live revision on
    # the branch as an OP_CHANGE instead of an OP_ADD, because it
    # reflects, however indirectly, a change w.r.t. the source
    # revision from which the branch sprouts.
    #
    # This is issue #89.
    #
    # Only revisions with at least four components (i.e. not on trunk)
    # that are alive need this check.
    if revision.count('.') >= 3 and info.state != 'dead':
        walker = revision
        ancestor = self.prev_rev.get(walker, None)
        while walker and ancestor:
            if (not on_same_line(walker, ancestor)
                and self._rev_data[walker].state == 'dead'
                and self._rev_data[ancestor].state != 'dead'):
                op = common.OP_CHANGE
            walker = ancestor
            ancestor = self.prev_rev.get(walker, None)

    # The ids are obtained in this fixed order because _get_rev_id may
    # generate a new id on each call.
    rev_id = self._get_rev_id(revision)
    prev_id = self._get_rev_id(prev_rev)
    next_id = self._get_rev_id(next_rev)

    c_rev = cvs_revision.CVSRevision(
        rev_id, self.cvs_file,
        info.timestamp, digest,
        prev_id, next_id,
        prev_timestamp, next_timestamp, op,
        prev_rev, revision, next_rev,
        bool(text),
        self.rev_to_branch_name(revision),
        self.taglist.get(revision, []), self.branchlist.get(revision, []))
    self._c_revs[revision] = c_rev
    self.collect_data.add_cvs_revision(c_rev)

    if not self.collect_data.metadata_db.has_key(digest):
        self.collect_data.metadata_db[digest] = (info.author, log)
def parse_completed(self):
    """Walk through all branches and tags and register them with their
    parent branch in the symbol database.

    Every symbol attached to a revision that lies on a named branch is
    registered as a blocker of that branch; symbols attached to trunk
    revisions are skipped.  Finally the file counter of the
    CollectData instance is incremented.

    This is a callback method declared in Sink."""

    # Tags are handled before branches.  The two dicts are iterated
    # one after the other rather than concatenating their items(),
    # which only works where items() returns lists.
    for symbol_map in (self.taglist, self.branchlist):
        for revision, symbols in symbol_map.items():
            # The branch depends only on the revision, so look it up
            # once per revision rather than once per symbol.
            name = self.rev_to_branch_name(revision)
            if name is None:
                continue
            for symbol in symbols:
                self.collect_data.symbol_db.register_branch_blocker(
                    name, symbol)

    self.collect_data.num_files += 1
577 class CollectData:
578 """Repository for data collected by parsing the CVS repository files.
580 This class manages the databases into which information collected
581 from the CVS repository is stored. The data are stored into this
582 class by FileDataCollector instances, one of which is created for
583 each file to be parsed."""
def __init__(self):
    """Create the databases and data files that receive the collected
    data, and initialise the counters and key generators."""

    temp_file = artifact_manager.get_temp_file

    # Databases of CVSFile and CVSRevision records; the revision
    # database is given the file database.
    files_db_path = temp_file(config.CVS_FILES_DB)
    self._cvs_file_db = CVSFileDatabase(files_db_path, database.DB_OPEN_NEW)

    revs_db_path = temp_file(config.CVS_REVS_DB)
    self._cvs_revs_db = CVSRevisionDatabase(
        self._cvs_file_db, revs_db_path, database.DB_OPEN_NEW)

    # Plain data files, opened for writing.
    self._all_revs = open(temp_file(config.ALL_REVS_DATAFILE), 'w')
    self.resync = open(temp_file(config.RESYNC_DATAFILE), 'w')

    default_branches_path = temp_file(config.DEFAULT_BRANCHES_DB)
    self.default_branches_db = database.SDatabase(
        default_branches_path, database.DB_OPEN_NEW)

    metadata_path = temp_file(config.METADATA_DB)
    self.metadata_db = database.Database(
        metadata_path, database.DB_OPEN_NEW)

    # Error messages that FileDataCollector instances append to.
    self.fatal_errors = []
    # Incremented once per file by FileDataCollector.parse_completed.
    self.num_files = 0
    self.symbol_db = symbol_database.SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # Key generator to generate unique keys for each CVSFile object:
    self.file_key_generator = KeyGenerator(1)

    # Key generator to generate unique keys for each CVSRevision object:
    self.key_generator = KeyGenerator()
def add_cvs_file(self, cvs_file):
    """Give CVS_FILE a persistent id and log it to the CVS file
    database.

    CVS_FILE must not have been given an id yet; its id attribute has
    to be None (this is asserted)."""

    assert cvs_file.id is None
    new_id = self.file_key_generator.gen_id()
    cvs_file.id = new_id
    self._cvs_file_db.log_file(cvs_file)
def add_cvs_revision(self, c_rev):
    """Log C_REV to the revision database, append its unique key as a
    line to the all-revisions data file, and record it with the
    StatsKeeper."""

    self._cvs_revs_db.log_revision(c_rev)
    key_line = '%s\n' % (c_rev.unique_key(),)
    self._all_revs.write(key_line)
    StatsKeeper().record_c_rev(c_rev)
def write_symbol_db(self):
    """Ask the symbol database to write itself out."""

    symbols = self.symbol_db
    symbols.write()