cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2006 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """This module contains classes that collect data from CVS repository files."""
  18
  19
  20 from __future__ import generators
  21
  22 import sys
  23 import os
  24 import re
  25 import time
  26 import sha
  27 import stat
  28
  29 from boolean import *
  30 import common
  31 from common import warning_prefix
  32 from common import error_prefix
  33 import config
  34 from log import Log
  35 from context import Ctx
  36 from artifact_manager import artifact_manager
  37 from cvs_file import CVSFile
  38 import cvs_revision
  39 from stats_keeper import StatsKeeper
  40 from key_generator import KeyGenerator
  41 import database
  42 from cvs_file_database import CVSFileDatabase
  43 from cvs_revision_database import CVSRevisionDatabase
  44 import symbol_database
  45 import cvs2svn_rcsparse
  46
  47
  48 OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
  49
  50 trunk_rev = re.compile(r'^[0-9]+\.[0-9]+$')
  51 cvs_branch_tag = re.compile(r'^((?:[0-9]+\.[0-9]+\.)+)0\.([0-9]+)$')
  52 rcs_branch_tag = re.compile(r'^(?:[0-9]+\.[0-9]+\.)+[0-9]+$')
  53
  54 # This really only matches standard '1.1.1.*'-style vendor revisions.
  55 # One could conceivably have a file whose default branch is 1.1.3 or
  56 # whatever, or was that at some point in time, with vendor revisions
  57 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
  58 # is the only time this regexp gets used), we'd have no basis for
  59 # assuming that the non-standard vendor branch had ever been the
  60 # default branch anyway, so we don't want this to match them anyway.
  61 vendor_revision = re.compile(r'^(1\.1\.1)\.([0-9])+$')
  62
  63
  64 def is_branch_revision(rev):
  65   """Return True iff this revision is not a trunk revision."""
  66
  67   return rev.count('.') >= 3
  68
  69
  70 def is_same_line_of_development(rev1, rev2):
  71   """Return True if rev1 and rev2 are on the same line of
  72   development (i.e., both on trunk, or both on the same branch);
  73   return False otherwise.  Either rev1 or rev2 can be None, in
  74   which case automatically return False."""
  75
  76   if rev1 is None or rev2 is None:
  77     return False
  78   if rev1.count('.') == 1 and rev2.count('.') == 1:
  79     return True
  80   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
  81     return True
  82   return False
  83
  84
  85 class _RevisionData:
  86   """We track the state of each revision so that in set_revision_info,
  87   we can determine if our op is an add/change/delete.  We can do this
  88   because in set_revision_info, we'll have all of the _RevisionData
  89   for a file at our fingertips, and we need to examine the state of
  90   our prev_rev to determine if we're an add or a change.  Without the
  91   state of the prev_rev, we are unable to distinguish between an add
  92   and a change."""
  93
  94   def __init__(self, rev, timestamp, author, state, branches):
  95     self.rev = rev
  96     self.timestamp = timestamp
  97     self.author = author
  98     self.original_timestamp = timestamp
  99     self._adjusted = False
 100     self.state = state
 101
 102     # Numbers of branch first revisions sprouting from this revision,
 103     # as specified by define_revision():
 104     self.branches = branches
 105
 106     # The revision number of the parent of this revision along the
 107     # same line of development, if any.
 108     #
 109     # For the first revision R on a branch, we consider the revision
 110     # from which R sprouted to be the 'previous'.
 111     #
 112     # Note that this revision can't be determined arithmetically (due
 113     # to cvsadmin -o, which is why this is necessary).
 114     #
 115     # If the key has no previous revision, then this field is None.
 116     self.parent = None
 117
 118     # The revision numbers of any children that depend on this revision:
 119     self.children = []
 120
 121   def adjust_timestamp(self, timestamp):
 122     self._adjusted = True
 123     self.timestamp = timestamp
 124
 125   def timestamp_was_adjusted(self):
 126     return self._adjusted
 127
 128
 129 class FileDataCollector(cvs2svn_rcsparse.Sink):
 130   """Class responsible for collecting RCS data for a particular file.
 131
 132   Any collected data that need to be remembered are stored into the
 133   referenced CollectData instance."""
 134
 135   def __init__(self, collect_data, filename):
 136     """Create an object that is prepared to receive data for FILENAME.
 137     FILENAME is the absolute filesystem path to the file in question.
 138     COLLECT_DATA is used to store the information collected about the
 139     file."""
 140
 141     self.collect_data = collect_data
 142
 143     (dirname, basename,) = os.path.split(filename)
 144     if dirname.endswith(OS_SEP_PLUS_ATTIC):
 145       # drop the 'Attic' portion from the filename for the canonical name:
 146       canonical_filename = os.path.join(
 147           dirname[:-len(OS_SEP_PLUS_ATTIC)], basename)
 148       file_in_attic = True
 149     else:
 150       canonical_filename = filename
 151       file_in_attic = False
 152
 153     # We calculate and save some file metadata here, where we can do
 154     # it only once per file, instead of waiting until later where we
 155     # would have to do the same calculations once per CVS *revision*.
 156
 157     cvs_path = Ctx().cvs_repository.get_cvs_path(canonical_filename)
 158
 159     file_stat = os.stat(filename)
 160     # The size of our file in bytes
 161     file_size = file_stat[stat.ST_SIZE]
 162
 163     # Whether or not the executable bit is set.
 164     file_executable = bool(file_stat[0] & stat.S_IXUSR)
 165
 166     # mode is not known yet, so we temporarily set it to None.
 167     self.cvs_file = CVSFile(
 168         None, filename, canonical_filename, cvs_path,
 169         file_in_attic, file_executable, file_size, None
 170         )
 171
 172     # A list [ ( name, revision) ] of each known symbol in this file
 173     # with the revision number that it corresponds to.
 174     self._symbols = []
 175
 176     # A map { revision -> c_rev } of the CVSRevision instances for all
 177     # revisions related to this file.  Note that items in this map
 178     # might be pre-filled as CVSRevisionIDs for revisions referred to
 179     # by earlier revisions but not yet processed.  As the revisions
 180     # are defined, the values are changed into CVSRevision instances.
 181     self._c_revs = {}
 182
 183     # { revision : _RevisionData instance }
 184     self._rev_data = { }
 185
 186     # A list [ revision ] of the revision numbers seen, in the order
 187     # they were given to us by rcsparse:
 188     self._rev_order = []
 189
 190     # A list [ (parent, child) ] of revision number pairs indicating
 191     # that child depends on parent.
 192     self._dependencies = []
 193
 194     # This dict is essentially self.prev_rev with the values mapped in
 195     # the other direction, so following key -> value will yield you
 196     # the next revision number.
 197     #
 198     # If the key has no next revision, then the key is not present.
 199     self.next_rev = { }
 200
 201     # Hash mapping branch numbers, like '1.7.2', to branch names,
 202     # like 'Release_1_0_dev'.
 203     self.branch_names = { }
 204
 205     # Hash mapping revision numbers, like '1.7', to lists of names
 206     # indicating which branches sprout from that revision, like
 207     # ['Release_1_0_dev', 'experimental_driver', ...].
 208     self.branchlist = { }
 209
 210     # Like self.branchlist, but the values are lists of tag names that
 211     # apply to the key revision.
 212     self.taglist = { }
 213
 214     # If set, this is an RCS branch number -- rcsparse calls this the
 215     # "principal branch", but CVS and RCS refer to it as the "default
 216     # branch", so that's what we call it, even though the rcsparse API
 217     # setter method is still 'set_principal_branch'.
 218     self.default_branch = None
 219
 220     # If the RCS file doesn't have a default branch anymore, but does
 221     # have vendor revisions, then we make an educated guess that those
 222     # revisions *were* the head of the default branch up until the
 223     # commit of 1.2, at which point the file's default branch became
 224     # trunk.  This records the date at which 1.2 was committed.
 225     self.first_non_vendor_revision_date = None
 226
 227   def _get_rev_id(self, revision):
 228     if revision is None:
 229       return None
 230     id = self._c_revs.get(revision)
 231     if id is None:
 232       id = cvs_revision.CVSRevisionID(
 233           self.collect_data.key_generator.gen_id(), self.cvs_file, revision)
 234       self._c_revs[revision] = id
 235     return id.id
 236
 237   def set_principal_branch(self, branch):
 238     """This is a callback method declared in Sink."""
 239
 240     self.default_branch = branch
 241
 242   def set_expansion(self, mode):
 243     """This is a callback method declared in Sink."""
 244
 245     self.cvs_file.mode = mode
 246
 247   def define_tag(self, name, revision):
 248     """Remember the symbol name and revision, but don't process them yet.
 249
 250     This is a callback method declared in Sink."""
 251
 252     self._symbols.append( (name, revision,) )
 253
 254   def set_branch_name(self, branch_number, name):
 255     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 256     and derive and record the revision from which NAME sprouts.
 257     BRANCH_NUMBER is an RCS branch number with an odd number of
 258     components, for example '1.7.2' (never '1.7.0.2')."""
 259
 260     if self.branch_names.has_key(branch_number):
 261       sys.stderr.write("%s: in '%s':\n"
 262                        "   branch '%s' already has name '%s',\n"
 263                        "   cannot also have name '%s', ignoring the latter\n"
 264                        % (warning_prefix,
 265                           self.cvs_file.filename, branch_number,
 266                           self.branch_names[branch_number], name))
 267       return
 268
 269     self.branch_names[branch_number] = name
 270     # The branchlist is keyed on the revision number from which the
 271     # branch sprouts, so strip off the odd final component.
 272     sprout_rev = branch_number[:branch_number.rfind(".")]
 273     self.branchlist.setdefault(sprout_rev, []).append(name)
 274     self.collect_data.symbol_db.register_branch_creation(name)
 275
 276   def set_tag_name(self, revision, name):
 277     """Record that tag NAME refers to the specified REVISION."""
 278
 279     self.taglist.setdefault(revision, []).append(name)
 280     self.collect_data.symbol_db.register_tag_creation(name)
 281
 282   def rev_to_branch_name(self, revision):
 283     """Return the name of the branch on which REVISION lies.
 284     REVISION is a non-branch revision number with an even number of,
 285     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 286     For the convenience of callers, REVISION can also be a trunk
 287     revision such as '1.2', in which case just return None."""
 288
 289     if trunk_rev.match(revision):
 290       return None
 291     return self.branch_names.get(revision[:revision.rindex(".")])
 292
 293   def _process_symbol(self, name, revision):
 294     """Record a bidirectional mapping between symbolic NAME and REVISION.
 295     REVISION is an unprocessed revision number from the RCS file's
 296     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
 297     This function will determine what kind of symbolic name it is by
 298     inspection, and record it in the right places."""
 299
 300     m = cvs_branch_tag.match(revision)
 301     if m:
 302       self.set_branch_name(m.group(1) + m.group(2), name)
 303     elif rcs_branch_tag.match(revision):
 304       self.set_branch_name(revision, name)
 305     else:
 306       self.set_tag_name(revision, name)
 307
 308   def _transform_symbol(self, name):
 309     """Transform the symbol NAME using the renaming rules specified
 310     with --symbol-transform.  Return the transformed symbol name."""
 311
 312     for (pattern, replacement) in Ctx().symbol_transforms:
 313       newname = pattern.sub(replacement, name)
 314       if newname != name:
 315         Log().warn("   symbol '%s' transformed to '%s'" % (name, newname))
 316         name = newname
 317
 318     return name
 319
 320   def _process_symbols(self):
 321     # A list of all symbols defined for the current file.  Used to
 322     # prevent multiple definitions of a symbol, something which can
 323     # easily happen when --symbol-transform is used.
 324     defined_symbols = { }
 325
 326     for (name, revision,) in self._symbols:
 327       name = self._transform_symbol(name)
 328
 329       if defined_symbols.has_key(name):
 330         err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
 331                   % (error_prefix, name, self.cvs_file.filename)
 332         sys.stderr.write(err + "\n")
 333         self.collect_data.fatal_errors.append(err)
 334
 335       defined_symbols[name] = None
 336
 337       self._process_symbol(name, revision)
 338
 339     # Free memory:
 340     self._symbols = None
 341
 342   def admin_completed(self):
 343     """This is a callback method declared in Sink."""
 344
 345     self._process_symbols()
 346     self.collect_data.add_cvs_file(self.cvs_file)
 347
 348   def define_revision(self, revision, timestamp, author, state,
 349                       branches, next):
 350     """This is a callback method declared in Sink."""
 351
 352     rev_data = _RevisionData(
 353         revision, int(timestamp), author, state, branches)
 354     self._rev_order.append(revision)
 355     self._rev_data[revision] = rev_data
 356
 357     # When on trunk, the RCS 'next' revision number points to what
 358     # humans might consider to be the 'previous' revision number.  For
 359     # example, 1.3's RCS 'next' is 1.2.
 360     #
 361     # However, on a branch, the RCS 'next' revision number really does
 362     # point to what humans would consider to be the 'next' revision
 363     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 364     #
 365     # In other words, in RCS, 'next' always means "where to find the next
 366     # deltatext that you need this revision to retrieve.
 367     #
 368     # That said, we don't *want* RCS's behavior here, so we determine
 369     # whether we're on trunk or a branch and set the dependencies
 370     # accordingly.
 371     #
 372     # One last thing.  Note that if REVISION is a branch revision,
 373     # instead of mapping REVISION to NEXT, we instead map NEXT to
 374     # REVISION.  Since we loop over all revisions in the file before
 375     # doing anything with the data we gather here, this 'reverse
 376     # assignment' effectively does the following:
 377     #
 378     # 1. Gives us no 'prev' value for REVISION (in this
 379     # iteration... it may have been set in a previous iteration)
 380     #
 381     # 2. Sets the 'prev' value for the revision with number NEXT to
 382     # REVISION.  So when we come around to the branch revision whose
 383     # revision value is NEXT, its 'prev' and 'prev_rev' are already
 384     # set.
 385     if next:
 386       if trunk_rev.match(revision):
 387         self._dependencies.append( (next, revision,) )
 388       else:
 389         self._dependencies.append( (revision, next,) )
 390
 391     if next:
 392       if trunk_rev.match(revision):
 393         self.next_rev[next] = revision
 394       else:
 395         self.next_rev[revision] = next
 396
 397   def _set_branch_dependencies(self, rev_data):
 398     """Set any branches sprouting from REV_DATA to depend on it."""
 399
 400     for b in rev_data.branches:
 401       self._dependencies.append( (rev_data.rev, b) )
 402
 403   def _resolve_dependencies(self):
 404     """Store the dependencies in self._dependencies into the rev_data
 405     objects."""
 406
 407     for (parent, child,) in self._dependencies:
 408       self._rev_data[parent].children.append(child)
 409       child_data = self._rev_data[child]
 410       assert child_data.parent is None
 411       child_data.parent = parent
 412
 413     # Free memory:
 414     self._dependencies = None
 415
 416   def _update_default_branch(self, rev_data):
 417     """Ratchet up the highest vendor head revision based on REV_DATA,
 418     if necessary."""
 419
 420     if self.default_branch:
 421       default_branch_root = self.default_branch + "."
 422       if (rev_data.rev.startswith(default_branch_root)
 423           and default_branch_root.count('.') == rev_data.rev.count('.')):
 424         # This revision is on the default branch, so record that it is
 425         # the new highest default branch head revision.
 426         self.collect_data.default_branches_db[self.cvs_file.cvs_path] = \
 427             rev_data.rev
 428     else:
 429       # No default branch, so make an educated guess.
 430       if rev_data.rev == '1.2':
 431         # This is probably the time when the file stopped having a
 432         # default branch, so make a note of it.
 433         self.first_non_vendor_revision_date = rev_data.timestamp
 434       else:
 435         m = vendor_revision.match(rev_data.rev)
 436         if m and ((not self.first_non_vendor_revision_date)
 437                   or (rev_data.timestamp
 438                       < self.first_non_vendor_revision_date)):
 439           # We're looking at a vendor revision, and it wasn't
 440           # committed after this file lost its default branch, so bump
 441           # the maximum trunk vendor revision in the permanent record.
 442           self.collect_data.default_branches_db[self.cvs_file.cvs_path] = \
 443               rev_data.rev
 444
 445   def _register_branch_commit(self, rev):
 446     """Register REV, which is a non-trunk revision number, as a commit
 447     on the corresponding branch."""
 448
 449     # Check for unlabeled branches, record them.  We tried to collect
 450     # all branch names when we parsed the symbolic name header
 451     # earlier, of course, but that didn't catch unlabeled branches.
 452     # If a branch is unlabeled, this is our first encounter with it,
 453     # so we have to record its data now.
 454     branch_number = rev[:rev.rindex(".")]
 455     if not self.branch_names.has_key(branch_number):
 456       branch_name = "unlabeled-" + branch_number
 457       self.set_branch_name(branch_number, branch_name)
 458
 459     # Register the commit on this non-trunk branch
 460     branch_name = self.branch_names[branch_number]
 461     self.collect_data.symbol_db.register_branch_commit(branch_name)
 462
 463   def _resync_chain(self, rev_data):
 464     """If the REV_DATA.parent revision exists and it occurred later
 465     than the REV_DATA revision, then shove the previous revision back
 466     in time (and any before it that may need to shift).  Return True
 467     iff any resyncing was done.
 468
 469     We sync backwards and not forwards because any given CVS Revision
 470     has only one previous revision.  However, a CVS Revision can *be*
 471     a previous revision for many other revisions (e.g., a revision
 472     that is the source of multiple branches).  This becomes relevant
 473     when we do the secondary synchronization in pass 2--we can make
 474     certain that we don't resync a revision earlier than its previous
 475     revision, but it would be non-trivial to make sure that we don't
 476     resync revision R *after* any revisions that have R as a previous
 477     revision."""
 478
 479     resynced = False
 480     while rev_data.parent is not None:
 481       prev_rev_data = self._rev_data[rev_data.parent]
 482
 483       if prev_rev_data.timestamp < rev_data.timestamp:
 484         # No resyncing needed here.
 485         return resynced
 486
 487       old_timestamp = prev_rev_data.timestamp
 488       prev_rev_data.adjust_timestamp(rev_data.timestamp - 1)
 489       resynced = True
 490       delta = prev_rev_data.timestamp - old_timestamp
 491       Log().verbose(
 492           "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds"
 493           % (self.cvs_file.cvs_path, prev_rev_data.rev,
 494              time.ctime(old_timestamp), delta))
 495       if abs(delta) > config.COMMIT_THRESHOLD:
 496         Log().warn(
 497             "%s: Significant timestamp change for '%s' (%d seconds)"
 498             % (warning_prefix, self.cvs_file.cvs_path, delta))
 499       rev_data = prev_rev_data
 500
 501     return resynced
 502
 503   def tree_completed(self):
 504     """The revision tree has been parsed.  Analyze it for consistency.
 505
 506     This is a callback method declared in Sink."""
 507
 508     for rev in self._rev_order:
 509       rev_data = self._rev_data[rev]
 510
 511       self._set_branch_dependencies(rev_data)
 512
 513       self._update_default_branch(rev_data)
 514
 515       if not trunk_rev.match(rev_data.rev):
 516         self._register_branch_commit(rev_data.rev)
 517
 518     self._resolve_dependencies()
 519
 520     # Our algorithm depends upon the timestamps on the revisions occuring
 521     # monotonically over time.  That is, we want to see rev 1.34 occur in
 522     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
 523     # sorting), and then tried to insert 1.34, we'd be screwed.
 524
 525     # To perform the analysis, we'll simply visit all of the 'previous'
 526     # links that we have recorded and validate that the timestamp on the
 527     # previous revision is before the specified revision.
 528
 529     # If we have to resync some nodes, then we restart the scan.  Just
 530     # keep looping as long as we need to restart.
 531     while True:
 532       for rev_data in self._rev_data.values():
 533         if self._resync_chain(rev_data):
 534           # Abort for loop, causing the scan to start again:
 535           break
 536       else:
 537         # Finished the for-loop without having to resync anything.
 538         # We're done.
 539         return
 540
 541   def _determine_operation(self, rev_data):
 542     # How to tell if a CVSRevision is an add, a change, or a deletion:
 543     #
 544     # It's a delete if RCS state is 'dead'
 545     #
 546     # It's an add if RCS state is 'Exp.' and
 547     #      - we either have no previous revision
 548     #        or
 549     #      - we have a previous revision whose state is 'dead'
 550     #
 551     # Anything else is a change.
 552     prev_rev_data = self._rev_data.get(rev_data.parent)
 553
 554     if rev_data.state == 'dead':
 555       op = common.OP_DELETE
 556     elif prev_rev_data is None or prev_rev_data.state == 'dead':
 557       op = common.OP_ADD
 558     else:
 559       op = common.OP_CHANGE
 560
 561     # There can be an odd situation where the tip revision of a branch
 562     # is alive, but every predecessor on the branch is in state 'dead',
 563     # yet the revision from which the branch sprouts is alive.  (This
 564     # is sort of a mirror image of the more common case of adding a
 565     # file on a branch, in which the first revision on the branch is
 566     # alive while the revision from which it sprouts is dead.)
 567     #
 568     # In this odd situation, we must mark the first live revision on
 569     # the branch as an OP_CHANGE instead of an OP_ADD, because it
 570     # reflects, however indirectly, a change w.r.t. the source
 571     # revision from which the branch sprouts.
 572     #
 573     # This is issue #89.
 574     cur_num = rev_data.rev
 575     if is_branch_revision(rev_data.rev) and rev_data.state != 'dead':
 576       while 1:
 577         prev_num = self._rev_data[cur_num].parent
 578         if not cur_num or not prev_num:
 579           break
 580         if (not is_same_line_of_development(cur_num, prev_num)
 581             and self._rev_data[cur_num].state == 'dead'
 582             and self._rev_data[prev_num].state != 'dead'):
 583           op = common.OP_CHANGE
 584         cur_num = self._rev_data[cur_num].parent
 585
 586     return op
 587
 588   def set_revision_info(self, revision, log, text):
 589     """This is a callback method declared in Sink."""
 590
 591     rev_data = self._rev_data[revision]
 592     digest = sha.new(log + '\0' + rev_data.author).hexdigest()
 593     if rev_data.timestamp_was_adjusted():
 594       # the timestamp on this revision was changed. log it for later
 595       # resynchronization of other files's revisions that occurred
 596       # for this time and log message.
 597       self.collect_data.resync.write(
 598           '%08lx %s %08lx\n'
 599           % (rev_data.original_timestamp, digest, rev_data.timestamp))
 600
 601     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
 602     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
 603     #
 604     # If revision 1.1 appears to have been created via 'cvs add'
 605     # instead of 'cvs import', then this file probably never had a
 606     # default branch, so retroactively remove its record in the
 607     # default branches db.  The test is that the log message CVS uses
 608     # for 1.1 in imports is "Initial revision\n" with no period.
 609     if revision == '1.1' and log != 'Initial revision\n':
 610       try:
 611         del self.collect_data.default_branches_db[self.cvs_file.cvs_path]
 612       except KeyError:
 613         pass
 614
 615     # Get the timestamps of the previous and next revisions
 616     prev_rev = rev_data.parent
 617     prev_rev_data = self._rev_data.get(prev_rev)
 618     if prev_rev_data is None:
 619       prev_timestamp = 0
 620     else:
 621       prev_timestamp = prev_rev_data.timestamp
 622
 623     next_rev = self.next_rev.get(revision)
 624     next_rev_data = self._rev_data.get(next_rev)
 625     if next_rev_data is None:
 626       next_timestamp = 0
 627     else:
 628       next_timestamp = next_rev_data.timestamp
 629
 630     c_rev = cvs_revision.CVSRevision(
 631         self._get_rev_id(revision), self.cvs_file,
 632         rev_data.timestamp, digest,
 633         self._get_rev_id(prev_rev), self._get_rev_id(next_rev),
 634         prev_timestamp, next_timestamp, self._determine_operation(rev_data),
 635         prev_rev, revision, next_rev,
 636         bool(text),
 637         self.rev_to_branch_name(revision),
 638         self.taglist.get(revision, []), self.branchlist.get(revision, []))
 639     self._c_revs[revision] = c_rev
 640     self.collect_data.add_cvs_revision(c_rev)
 641
 642     if not self.collect_data.metadata_db.has_key(digest):
 643       self.collect_data.metadata_db[digest] = (rev_data.author, log)
 644
 645   def parse_completed(self):
 646     """Walk through all branches and tags and register them with their
 647     parent branch in the symbol database.
 648
 649     This is a callback method declared in Sink."""
 650
 651     for revision, symbols in self.taglist.items() + self.branchlist.items():
 652       for symbol in symbols:
 653         name = self.rev_to_branch_name(revision)
 654         if name is not None:
 655           self.collect_data.symbol_db.register_branch_blocker(name, symbol)
 656
 657     self.collect_data.num_files += 1
 658
 659
 660 class CollectData:
 661   """Repository for data collected by parsing the CVS repository files.
 662
 663   This class manages the databases into which information collected
 664   from the CVS repository is stored.  The data are stored into this
 665   class by FileDataCollector instances, one of which is created for
 666   each file to be parsed."""
 667
 668   def __init__(self):
 669     self._cvs_file_db = CVSFileDatabase(
 670         artifact_manager.get_temp_file(config.CVS_FILES_DB),
 671         database.DB_OPEN_NEW)
 672     self._cvs_revs_db = CVSRevisionDatabase(
 673         self._cvs_file_db,
 674         artifact_manager.get_temp_file(config.CVS_REVS_DB),
 675         database.DB_OPEN_NEW)
 676     self._all_revs = open(
 677         artifact_manager.get_temp_file(config.ALL_REVS_DATAFILE), 'w')
 678     self.resync = open(
 679         artifact_manager.get_temp_file(config.RESYNC_DATAFILE), 'w')
 680     self.default_branches_db = database.SDatabase(
 681         artifact_manager.get_temp_file(config.DEFAULT_BRANCHES_DB),
 682         database.DB_OPEN_NEW)
 683     self.metadata_db = database.Database(
 684         artifact_manager.get_temp_file(config.METADATA_DB),
 685         database.DB_OPEN_NEW)
 686     self.fatal_errors = []
 687     self.num_files = 0
 688     self.symbol_db = symbol_database.SymbolDatabase()
 689
 690     # 1 if we've collected data for at least one file, None otherwise.
 691     self.found_valid_file = None
 692
 693     # Key generator to generate unique keys for each CVSFile object:
 694     self.file_key_generator = KeyGenerator(1)
 695
 696     # Key generator to generate unique keys for each CVSRevision object:
 697     self.key_generator = KeyGenerator()
 698
 699   def add_cvs_file(self, cvs_file):
 700     """If CVS_FILE is not already stored to _cvs_revs_db, give it a
 701     persistent id and store it now.  The way we tell whether it was
 702     already stored is by whether it already has a non-None id."""
 703
 704     assert cvs_file.id is None
 705     cvs_file.id = self.file_key_generator.gen_id()
 706     self._cvs_file_db.log_file(cvs_file)
 707
 708   def add_cvs_revision(self, c_rev):
 709     self._cvs_revs_db.log_revision(c_rev)
 710     self._all_revs.write('%s\n' % (c_rev.unique_key(),))
 711     StatsKeeper().record_c_rev(c_rev)
 712
 713   def write_symbol_db(self):
 714     self.symbol_db.write()
 715
 716