cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2006 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """This module contains database facilities used by cvs2svn."""
  18
  19
  20 from __future__ import generators
  21
  22 import sys
  23 import os
  24 import re
  25 import time
  26 import sha
  27 import stat
  28
  29 from boolean import *
  30 import common
  31 from common import warning_prefix
  32 from common import error_prefix
  33 import config
  34 from log import Log
  35 from context import Ctx
  36 from artifact_manager import artifact_manager
  37 from cvs_file import CVSFile
  38 import cvs_revision
  39 from stats_keeper import StatsKeeper
  40 from key_generator import KeyGenerator
  41 import database
  42 from cvs_file_database import CVSFileDatabase
  43 from cvs_revision_database import CVSRevisionDatabase
  44 import symbol_database
  45 import cvs2svn_rcsparse
  46
  47
# Path suffix (os.sep followed by 'Attic') used to recognize RCS files
# that live in a CVS 'Attic' directory.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'

# Matches a revision number with exactly two components, such as
# '1.7'.  This module treats such revisions as trunk revisions.
trunk_rev = re.compile(r'^[0-9]+\.[0-9]+$')
# Matches a CVS-style branch number such as '1.7.0.2'.  Group 1 is
# everything before the '0.' including its trailing dot ('1.7.') and
# group 2 is the final component ('2'), so group(1) + group(2) yields
# the RCS-style branch number ('1.7.2').
cvs_branch_tag = re.compile(r'^((?:[0-9]+\.[0-9]+\.)+)0\.([0-9]+)$')
# Matches a number with an odd count of components (three or more),
# such as '1.7.2' or '1.1.1'.
rcs_branch_tag = re.compile(r'^(?:[0-9]+\.[0-9]+\.)+[0-9]+$')

# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# NOTE(review): the '+' is outside the second group, so group(2) holds
# only the last digit of the final component.  Nothing in this file
# reads the groups (only the truth value of the match is used), but
# confirm before relying on group(2) elsewhere.
vendor_revision = re.compile(r'^(1\.1\.1)\.([0-9])+$')
  62
  63
  64 class _RevisionData:
  65   """We track the state of each revision so that in set_revision_info,
  66   we can determine if our op is an add/change/delete.  We can do this
  67   because in set_revision_info, we'll have all of the _RevisionData
  68   for a file at our fingertips, and we need to examine the state of
  69   our prev_rev to determine if we're an add or a change.  Without the
  70   state of the prev_rev, we are unable to distinguish between an add
  71   and a change."""
  72
  73   def __init__(self, timestamp, author, state):
  74     self.timestamp = timestamp
  75     self.author = author
  76     self.original_timestamp = timestamp
  77     self._adjusted = False
  78     self.state = state
  79
  80   def adjust_timestamp(self, timestamp):
  81     self._adjusted = True
  82     self.timestamp = timestamp
  83
  84   def timestamp_was_adjusted(self):
  85     return self._adjusted
  86
  87
  88 class FileDataCollector(cvs2svn_rcsparse.Sink):
  89   """Class responsible for collecting RCS data for a particular file.
  90
  91   Any collected data that need to be remembered are stored into the
  92   referenced CollectData instance."""
  93
  def __init__(self, collect_data, filename):
    """Prepare to receive the parsed contents of the RCS file FILENAME.

    FILENAME is the absolute filesystem path of the file.
    COLLECT_DATA is the object into which everything worth
    remembering about the file is stored."""

    self.collect_data = collect_data

    # The canonical filename is FILENAME with a trailing 'Attic'
    # directory component (if there is one) removed.
    directory, leaf = os.path.split(filename)
    if directory.endswith(OS_SEP_PLUS_ATTIC):
      file_in_attic = True
      parent = directory[:-len(OS_SEP_PLUS_ATTIC)]
      canonical_filename = os.path.join(parent, leaf)
    else:
      file_in_attic = False
      canonical_filename = filename

    # The metadata below is computed here, once per file, so that it
    # does not have to be recomputed later once per CVS *revision*.

    cvs_path = Ctx().cvs_repository.get_cvs_path(canonical_filename)

    file_stat = os.stat(filename)
    # Size of the RCS file in bytes.
    file_size = file_stat[stat.ST_SIZE]
    # True iff the owner-executable bit is set in the file's mode.
    file_executable = bool(file_stat[stat.ST_MODE] & stat.S_IXUSR)

    # The last argument (the mode) is not known yet and is filled in
    # by set_expansion().  The first argument is None as well; it is
    # presumably the file id that CollectData.add_cvs_file() assigns
    # later -- TODO confirm against CVSFile's signature.
    self.cvs_file = CVSFile(
        None, filename, canonical_filename, cvs_path,
        file_in_attic, file_executable, file_size, None
        )

    # { revision : c_rev } for every revision related to this file.
    # A value may start life as a CVSRevisionID placeholder, created
    # when some other revision refers to it before it has been
    # processed; the placeholder is replaced by the real CVSRevision
    # once that revision is processed.
    self._c_revs = {}

    # { revision : _RevisionData instance }
    self._rev_data = {}

    # { revision : previous revision on the same line of development }
    #
    # For the first revision R on a branch, the 'previous' revision is
    # the revision from which R sprouted.
    #
    # This cannot be worked out arithmetically from the revision
    # numbers (because of cvsadmin -o), hence the explicit map.
    #
    # A revision that has no previous revision maps to None.
    self.prev_rev = {}

    # The inverse direction of self.prev_rev: following key -> value
    # yields the next revision number.
    #
    # Unlike self.prev_rev, a revision without a next revision is
    # simply absent from this map.
    self.next_rev = {}

    # { branch number, like '1.7.2' : branch name, like
    # 'Release_1_0_dev' }
    self.branch_names = {}

    # { revision number, like '1.7' : list of the names of the
    # branches that sprout from it, like ['Release_1_0_dev',
    # 'experimental_driver', ...] }
    self.branchlist = {}

    # Like self.branchlist, except that the lists hold the names of
    # the tags that apply to the key revision.
    self.taglist = {}

    # The RCS branch number of the default branch, if the file has
    # one.  rcsparse calls this the "principal branch" (hence the
    # name of the setter callback, set_principal_branch), but CVS and
    # RCS say "default branch", and so do we.
    self.default_branch = None

    # When the RCS file no longer has a default branch but does have
    # vendor revisions, we guess that those revisions *were* the head
    # of the default branch until 1.2 was committed, at which point
    # trunk became the default branch.  This is the date of 1.2.
    self.first_non_vendor_revision_date = None

    # The symbols defined so far for this file, stored as the keys of
    # a dict (the values are unused).  Used to detect multiple
    # definitions of one symbol, which can easily happen when
    # --symbol-transform is used.
    self.defined_symbols = {}
 192
 193   def _get_rev_id(self, revision):
 194     if revision is None:
 195       return None
 196     id = self._c_revs.get(revision)
 197     if id is None:
 198       id = cvs_revision.CVSRevisionID(
 199           self.collect_data.key_generator.gen_id(), self.cvs_file, revision)
 200       self._c_revs[revision] = id
 201     return id.id
 202
 203   def set_principal_branch(self, branch):
 204     """This is a callback method declared in Sink."""
 205
 206     self.default_branch = branch
 207
 208   def set_expansion(self, mode):
 209     """This is a callback method declared in Sink."""
 210
 211     self.cvs_file.mode = mode
 212
 213   def set_branch_name(self, branch_number, name):
 214     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 215     and derive and record the revision from which NAME sprouts.
 216     BRANCH_NUMBER is an RCS branch number with an odd number of
 217     components, for example '1.7.2' (never '1.7.0.2')."""
 218
 219     if self.branch_names.has_key(branch_number):
 220       sys.stderr.write("%s: in '%s':\n"
 221                        "   branch '%s' already has name '%s',\n"
 222                        "   cannot also have name '%s', ignoring the latter\n"
 223                        % (warning_prefix,
 224                           self.cvs_file.filename, branch_number,
 225                           self.branch_names[branch_number], name))
 226       return
 227
 228     self.branch_names[branch_number] = name
 229     # The branchlist is keyed on the revision number from which the
 230     # branch sprouts, so strip off the odd final component.
 231     sprout_rev = branch_number[:branch_number.rfind(".")]
 232     self.branchlist.setdefault(sprout_rev, []).append(name)
 233     self.collect_data.symbol_db.register_branch_creation(name)
 234
 235   def set_tag_name(self, revision, name):
 236     """Record that tag NAME refers to the specified REVISION."""
 237
 238     self.taglist.setdefault(revision, []).append(name)
 239     self.collect_data.symbol_db.register_tag_creation(name)
 240
 241   def rev_to_branch_name(self, revision):
 242     """Return the name of the branch on which REVISION lies.
 243     REVISION is a non-branch revision number with an even number of,
 244     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 245     For the convenience of callers, REVISION can also be a trunk
 246     revision such as '1.2', in which case just return None."""
 247
 248     if trunk_rev.match(revision):
 249       return None
 250     return self.branch_names.get(revision[:revision.rindex(".")])
 251
 252   def define_tag(self, name, revision):
 253     """Record a bidirectional mapping between symbolic NAME and REVISION.
 254     REVISION is an unprocessed revision number from the RCS file's
 255     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
 256     This function will determine what kind of symbolic name it is by
 257     inspection, and record it in the right places.
 258
 259     This is a callback method declared in Sink."""
 260
 261     for (pattern, replacement) in Ctx().symbol_transforms:
 262       newname = pattern.sub(replacement, name)
 263       if newname != name:
 264         Log().warn("   symbol '%s' transformed to '%s'" % (name, newname))
 265         name = newname
 266
 267     if self.defined_symbols.has_key(name):
 268       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
 269                 % (error_prefix, name, self.cvs_file.filename)
 270       sys.stderr.write(err + "\n")
 271       self.collect_data.fatal_errors.append(err)
 272
 273     self.defined_symbols[name] = None
 274
 275     m = cvs_branch_tag.match(revision)
 276     if m:
 277       self.set_branch_name(m.group(1) + m.group(2), name)
 278     elif rcs_branch_tag.match(revision):
 279       self.set_branch_name(revision, name)
 280     else:
 281       self.set_tag_name(revision, name)
 282
 283   def admin_completed(self):
 284     """This is a callback method declared in Sink."""
 285
 286     self.collect_data.add_cvs_file(self.cvs_file)
 287
 288   def define_revision(self, revision, timestamp, author, state,
 289                       branches, next):
 290     """This is a callback method declared in Sink."""
 291
 292     # store the rev_data as a list in case we have to jigger the timestamp
 293     self._rev_data[revision] = _RevisionData(int(timestamp), author, state)
 294
 295     # When on trunk, the RCS 'next' revision number points to what
 296     # humans might consider to be the 'previous' revision number.  For
 297     # example, 1.3's RCS 'next' is 1.2.
 298     #
 299     # However, on a branch, the RCS 'next' revision number really does
 300     # point to what humans would consider to be the 'next' revision
 301     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 302     #
 303     # In other words, in RCS, 'next' always means "where to find the next
 304     # deltatext that you need this revision to retrieve.
 305     #
 306     # That said, we don't *want* RCS's behavior here, so we determine
 307     # whether we're on trunk or a branch and set self.prev_rev
 308     # accordingly.
 309     #
 310     # One last thing.  Note that if REVISION is a branch revision,
 311     # instead of mapping REVISION to NEXT, we instead map NEXT to
 312     # REVISION.  Since we loop over all revisions in the file before
 313     # doing anything with the data we gather here, this 'reverse
 314     # assignment' effectively does the following:
 315     #
 316     # 1. Gives us no 'prev' value for REVISION (in this
 317     # iteration... it may have been set in a previous iteration)
 318     #
 319     # 2. Sets the 'prev' value for the revision with number NEXT to
 320     # REVISION.  So when we come around to the branch revision whose
 321     # revision value is NEXT, its 'prev' and 'prev_rev' are already
 322     # set.
 323     if trunk_rev.match(revision):
 324       self.prev_rev[revision] = next
 325       self.next_rev[next] = revision
 326     elif next:
 327       self.prev_rev[next] = revision
 328       self.next_rev[revision] = next
 329
 330     for b in branches:
 331       self.prev_rev[b] = revision
 332
 333     # Ratchet up the highest vendor head revision, if necessary.
 334     if self.default_branch:
 335       default_branch_root = self.default_branch + "."
 336       if (revision.startswith(default_branch_root)
 337           and default_branch_root.count('.') == revision.count('.')):
 338         # This revision is on the default branch, so record that it is
 339         # the new highest default branch head revision.
 340         self.collect_data.default_branches_db[self.cvs_file.cvs_path] = \
 341             revision
 342     else:
 343       # No default branch, so make an educated guess.
 344       if revision == '1.2':
 345         # This is probably the time when the file stopped having a
 346         # default branch, so make a note of it.
 347         self.first_non_vendor_revision_date = timestamp
 348       else:
 349         m = vendor_revision.match(revision)
 350         if m and ((not self.first_non_vendor_revision_date)
 351                   or (timestamp < self.first_non_vendor_revision_date)):
 352           # We're looking at a vendor revision, and it wasn't
 353           # committed after this file lost its default branch, so bump
 354           # the maximum trunk vendor revision in the permanent record.
 355           self.collect_data.default_branches_db[self.cvs_file.cvs_path] = \
 356               revision
 357
 358     if not trunk_rev.match(revision):
 359       # Check for unlabeled branches, record them.  We tried to collect
 360       # all branch names when we parsed the symbolic name header
 361       # earlier, of course, but that didn't catch unlabeled branches.
 362       # If a branch is unlabeled, this is our first encounter with it,
 363       # so we have to record its data now.
 364       branch_number = revision[:revision.rindex(".")]
 365       if not self.branch_names.has_key(branch_number):
 366         branch_name = "unlabeled-" + branch_number
 367         self.set_branch_name(branch_number, branch_name)
 368
 369       # Register the commit on this non-trunk branch
 370       branch_name = self.branch_names[branch_number]
 371       self.collect_data.symbol_db.register_branch_commit(branch_name)
 372
 373   def _resync_chain(self, current, prev):
 374     """If the PREV revision exists and it occurred later than the
 375     CURRENT revision, then shove the previous revision back in time
 376     (and any before it that may need to shift).  Return True iff any
 377     resyncing was done.
 378
 379     We sync backwards and not forwards because any given CVS Revision
 380     has only one previous revision.  However, a CVS Revision can *be*
 381     a previous revision for many other revisions (e.g., a revision
 382     that is the source of multiple branches).  This becomes relevant
 383     when we do the secondary synchronization in pass 2--we can make
 384     certain that we don't resync a revision earlier than its previous
 385     revision, but it would be non-trivial to make sure that we don't
 386     resync revision R *after* any revisions that have R as a previous
 387     revision."""
 388
 389     resynced = False
 390     while prev is not None:
 391       current_rev_data = self._rev_data[current]
 392       prev_rev_data = self._rev_data[prev]
 393
 394       if prev_rev_data.timestamp < current_rev_data.timestamp:
 395         # No resyncing needed here.
 396         return resynced
 397
 398       old_timestamp = prev_rev_data.timestamp
 399       prev_rev_data.adjust_timestamp(current_rev_data.timestamp - 1)
 400       resynced = True
 401       delta = prev_rev_data.timestamp - old_timestamp
 402       Log().verbose(
 403           "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds"
 404           % (self.cvs_file.cvs_path, prev,
 405              time.ctime(old_timestamp), delta))
 406       if abs(delta) > config.COMMIT_THRESHOLD:
 407         Log().warn(
 408             "%s: Significant timestamp change for '%s' (%d seconds)"
 409             % (warning_prefix, self.cvs_file.cvs_path, delta))
 410       current = prev
 411       prev = self.prev_rev[current]
 412
 413     return resynced
 414
 415   def tree_completed(self):
 416     """The revision tree has been parsed.  Analyze it for consistency.
 417
 418     This is a callback method declared in Sink."""
 419
 420     # Our algorithm depends upon the timestamps on the revisions occuring
 421     # monotonically over time.  That is, we want to see rev 1.34 occur in
 422     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
 423     # sorting), and then tried to insert 1.34, we'd be screwed.
 424
 425     # To perform the analysis, we'll simply visit all of the 'previous'
 426     # links that we have recorded and validate that the timestamp on the
 427     # previous revision is before the specified revision.
 428
 429     # If we have to resync some nodes, then we restart the scan.  Just
 430     # keep looping as long as we need to restart.
 431     while True:
 432       for current, prev in self.prev_rev.items():
 433         if self._resync_chain(current, prev):
 434           # Abort for loop, causing the scan to start again:
 435           break
 436       else:
 437         # Finished the for-loop without having to resync anything.
 438         # We're done.
 439         return
 440
  def set_revision_info(self, revision, log, text):
    """Build the CVSRevision record for REVISION and store it.

    LOG is the log message of REVISION.  TEXT is only tested for
    truth; bool(TEXT) is passed on to the CVSRevision constructor.

    The method
    - computes a digest over LOG and the revision's author,
    - writes a line to the resync file if the revision's timestamp
      was adjusted earlier,
    - removes this file's entry from the default branches db if
      REVISION is '1.1' and LOG is not 'Initial revision\n',
    - classifies the revision as an add, a change or a delete,
    - creates the CVSRevision, remembers it in self._c_revs and hands
      it to the CollectData instance, and
    - stores (author, LOG) in the metadata db under the digest unless
      an entry for that digest already exists.

    This is a callback method declared in Sink."""

    rev_data = self._rev_data[revision]
    # The digest identifies the combination of log message and author.
    digest = sha.new(log + '\0' + rev_data.author).hexdigest()
    if rev_data.timestamp_was_adjusted():
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files's revisions that occurred
      # for this time and log message.
      # Line format: original timestamp (hex), digest, new timestamp
      # (hex).
      self.collect_data.resync.write(
          '%08lx %s %08lx\n'
          % (rev_data.original_timestamp, digest, rev_data.timestamp))

    # "...Give back one kadam to honor the Hebrew God whose Ark this is."
    #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
    #
    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
      try:
        del self.collect_data.default_branches_db[self.cvs_file.cvs_path]
      except KeyError:
        # There was no record for this file; nothing to remove.
        pass

    # Get the timestamps of the previous and next revisions.  A
    # timestamp of 0 stands for "there is no such revision".
    prev_rev = self.prev_rev[revision]
    prev_rev_data = self._rev_data.get(prev_rev)
    if prev_rev_data is None:
      prev_timestamp = 0
    else:
      prev_timestamp = prev_rev_data.timestamp

    next_rev = self.next_rev.get(revision)
    next_rev_data = self._rev_data.get(next_rev)
    if next_rev_data is None:
      next_timestamp = 0
    else:
      next_timestamp = next_rev_data.timestamp

    # How to tell if a CVSRevision is an add, a change, or a deletion:
    #
    # It's a delete if RCS state is 'dead'
    #
    # It's an add if RCS state is not 'dead' and
    #      - we either have no previous revision
    #        or
    #      - we have a previous revision whose state is 'dead'
    #
    # Anything else is a change.
    if rev_data.state == 'dead':
      op = common.OP_DELETE
    elif prev_rev_data is None or prev_rev_data.state == 'dead':
      op = common.OP_ADD
    else:
      op = common.OP_CHANGE

    def is_branch_revision(rev):
      """Return True if REV has at least four components (i.e., at
      least three dots), which is how a non-trunk revision is
      recognized here; else return False."""

      if rev.count('.') >= 3:
        return True
      return False

    def is_same_line_of_development(rev1, rev2):
      """Return True if rev1 and rev2 are on the same line of
      development (i.e., both on trunk, or both on the same branch);
      return False otherwise.  Either rev1 or rev2 can be None, in
      which case automatically return False."""

      if rev1 is None or rev2 is None:
        return False
      # Two-component numbers are both on trunk.
      if rev1.count('.') == 1 and rev2.count('.') == 1:
        return True
      # Same branch iff everything before the last dot is equal.
      if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
        return True
      return False

    # There can be an odd situation where the tip revision of a branch
    # is alive, but every predecessor on the branch is in state 'dead',
    # yet the revision from which the branch sprouts is alive.  (This
    # is sort of a mirror image of the more common case of adding a
    # file on a branch, in which the first revision on the branch is
    # alive while the revision from which it sprouts is dead.)
    #
    # In this odd situation, we must mark the first live revision on
    # the branch as an OP_CHANGE instead of an OP_ADD, because it
    # reflects, however indirectly, a change w.r.t. the source
    # revision from which the branch sprouts.
    #
    # This is issue #89.
    cur_num = revision
    if is_branch_revision(revision) and rev_data.state != 'dead':
      # Walk backwards along the chain of previous revisions until
      # it ends.
      while 1:
        prev_num = self.prev_rev.get(cur_num, None)
        if not cur_num or not prev_num:
          break
        # A dead revision whose live predecessor is on a different
        # line of development: the op becomes a change.
        if (not is_same_line_of_development(cur_num, prev_num)
            and self._rev_data[cur_num].state == 'dead'
            and self._rev_data[prev_num].state != 'dead'):
          op = common.OP_CHANGE
        cur_num = self.prev_rev.get(cur_num, None)

    # Note that _get_rev_id() may generate new ids, in the order
    # revision, prev_rev, next_rev.
    c_rev = cvs_revision.CVSRevision(
        self._get_rev_id(revision), self.cvs_file,
        rev_data.timestamp, digest,
        self._get_rev_id(prev_rev), self._get_rev_id(next_rev),
        prev_timestamp, next_timestamp, op,
        prev_rev, revision, next_rev,
        bool(text),
        self.rev_to_branch_name(revision),
        self.taglist.get(revision, []), self.branchlist.get(revision, []))
    # Replace any CVSRevisionID placeholder with the real thing.
    self._c_revs[revision] = c_rev
    self.collect_data.add_cvs_revision(c_rev)

    # Store each distinct (author, log) combination only once.
    if not self.collect_data.metadata_db.has_key(digest):
      self.collect_data.metadata_db[digest] = (rev_data.author, log)
 561
 562   def parse_completed(self):
 563     """Walk through all branches and tags and register them with their
 564     parent branch in the symbol database.
 565
 566     This is a callback method declared in Sink."""
 567
 568     for revision, symbols in self.taglist.items() + self.branchlist.items():
 569       for symbol in symbols:
 570         name = self.rev_to_branch_name(revision)
 571         if name is not None:
 572           self.collect_data.symbol_db.register_branch_blocker(name, symbol)
 573
 574     self.collect_data.num_files += 1
 575
 576
 577 class CollectData:
 578   """Repository for data collected by parsing the CVS repository files.
 579
 580   This class manages the databases into which information collected
 581   from the CVS repository is stored.  The data are stored into this
 582   class by FileDataCollector instances, one of which is created for
 583   each file to be parsed."""
 584
 585   def __init__(self):
 586     self._cvs_file_db = CVSFileDatabase(
 587         artifact_manager.get_temp_file(config.CVS_FILES_DB),
 588         database.DB_OPEN_NEW)
 589     self._cvs_revs_db = CVSRevisionDatabase(
 590         self._cvs_file_db,
 591         artifact_manager.get_temp_file(config.CVS_REVS_DB),
 592         database.DB_OPEN_NEW)
 593     self._all_revs = open(
 594         artifact_manager.get_temp_file(config.ALL_REVS_DATAFILE), 'w')
 595     self.resync = open(
 596         artifact_manager.get_temp_file(config.RESYNC_DATAFILE), 'w')
 597     self.default_branches_db = database.SDatabase(
 598         artifact_manager.get_temp_file(config.DEFAULT_BRANCHES_DB),
 599         database.DB_OPEN_NEW)
 600     self.metadata_db = database.Database(
 601         artifact_manager.get_temp_file(config.METADATA_DB),
 602         database.DB_OPEN_NEW)
 603     self.fatal_errors = []
 604     self.num_files = 0
 605     self.symbol_db = symbol_database.SymbolDatabase()
 606
 607     # 1 if we've collected data for at least one file, None otherwise.
 608     self.found_valid_file = None
 609
 610     # Key generator to generate unique keys for each CVSFile object:
 611     self.file_key_generator = KeyGenerator(1)
 612
 613     # Key generator to generate unique keys for each CVSRevision object:
 614     self.key_generator = KeyGenerator()
 615
 616   def add_cvs_file(self, cvs_file):
 617     """If CVS_FILE is not already stored to _cvs_revs_db, give it a
 618     persistent id and store it now.  The way we tell whether it was
 619     already stored is by whether it already has a non-None id."""
 620
 621     assert cvs_file.id is None
 622     cvs_file.id = self.file_key_generator.gen_id()
 623     self._cvs_file_db.log_file(cvs_file)
 624
 625   def add_cvs_revision(self, c_rev):
 626     self._cvs_revs_db.log_revision(c_rev)
 627     self._all_revs.write('%s\n' % (c_rev.unique_key(),))
 628     StatsKeeper().record_c_rev(c_rev)
 629
 630   def write_symbol_db(self):
 631     self.symbol_db.write()
 632
 633