cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents.
  22
  23 As a *,v file is parsed, the information pertaining to the file is
  24 accumulated in memory, mostly in _RevisionData, _BranchData, and
  25 _TagData objects.  When parsing is complete, a final pass is made over
  26 the data to create some final dependency links, collect statistics,
  27 etc., then the _*Data objects are converted into CVSItem objects
  28 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  29 dumped into databases.
  30
  31 During the data collection, persistent unique ids are allocated to
  32 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  33 special case.  CVSItem ids are unique across all CVSItem types, and
  34 the ids are carried over from the corresponding data collection
  35 objects:
  36
  37     _RevisionData -> CVSRevision
  38
  39     _BranchData -> CVSBranch
  40
  41     _TagData -> CVSTag
  42
  43 In a later pass it is possible to convert tags <-> branches.  But even
  44 if this occurs, the new branch or tag uses the same id as the old tag
  45 or branch.
  46
  47 """
  48
  49
  50 import re
  51
  52 from cvs2svn_lib import config
  53 from cvs2svn_lib.common import DB_OPEN_NEW
  54 from cvs2svn_lib.common import warning_prefix
  55 from cvs2svn_lib.common import error_prefix
  56 from cvs2svn_lib.common import is_trunk_revision
  57 from cvs2svn_lib.common import is_branch_revision_number
  58 from cvs2svn_lib.log import logger
  59 from cvs2svn_lib.context import Ctx
  60 from cvs2svn_lib.artifact_manager import artifact_manager
  61 from cvs2svn_lib.cvs_path import CVSPath
  62 from cvs2svn_lib.cvs_path import CVSFile
  63 from cvs2svn_lib.cvs_path import CVSDirectory
  64 from cvs2svn_lib.symbol import Symbol
  65 from cvs2svn_lib.symbol import Trunk
  66 from cvs2svn_lib.cvs_item import CVSRevision
  67 from cvs2svn_lib.cvs_item import CVSBranch
  68 from cvs2svn_lib.cvs_item import CVSTag
  69 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  70 from cvs2svn_lib.cvs_file_items import VendorBranchError
  71 from cvs2svn_lib.cvs_file_items import CVSFileItems
  72 from cvs2svn_lib.key_generator import KeyGenerator
  73 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  74 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  75 from cvs2svn_lib.metadata_database import MetadataDatabase
  76 from cvs2svn_lib.metadata_database import MetadataLogger
  77 from cvs2svn_lib.repository_walker import walk_repository
  78
  79 import cvs2svn_rcsparse
  80
  81
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression used to canonicalize branch numbers.  Group 1
# captures the leading digit groups (including their trailing dots)
# and group 2 captures the final digit group; an optional '0.' between
# them is matched but not captured.  Substituting r'\1\2' therefore
# turns the CVS form '1.7.0.2' into the RCS form '1.7.2' and leaves
# '1.7.2' as it is.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
  98
  99
 100 def is_same_line_of_development(rev1, rev2):
 101   """Return True if rev1 and rev2 are on the same line of
 102   development (i.e., both on trunk, or both on the same branch);
 103   return False otherwise.  Either rev1 or rev2 can be None, in
 104   which case automatically return False."""
 105
 106   if rev1 is None or rev2 is None:
 107     return False
 108   if is_trunk_revision(rev1) and is_trunk_revision(rev2):
 109     # Trunk revisions have to be handled specially because the main
 110     # trunk version number can be changed; e.g., from 1 to 2.
 111     return True
 112   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 113     return True
 114   return False
 115
 116
 117 class _RevisionData:
 118   """We track the state of each revision so that in set_revision_info,
 119   we can determine if our op is an add/change/delete.  We can do this
 120   because in set_revision_info, we'll have all of the _RevisionData
 121   for a file at our fingertips, and we need to examine the state of
 122   our prev_rev to determine if we're an add or a change.  Without the
 123   state of the prev_rev, we are unable to distinguish between an add
 124   and a change."""
 125
 126   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 127     # The id of this revision:
 128     self.cvs_rev_id = cvs_rev_id
 129     self.rev = rev
 130     self.timestamp = timestamp
 131     self.author = author
 132     self.state = state
 133
 134     # If this is the first revision on a branch, then this is the
 135     # branch_data of that branch; otherwise it is None.
 136     self.parent_branch_data = None
 137
 138     # The revision number of the parent of this revision along the
 139     # same line of development, if any.  For the first revision R on a
 140     # branch, we consider the revision from which R sprouted to be the
 141     # 'parent'.  If this is the root revision in the file's revision
 142     # tree, then this field is None.
 143     #
 144     # Note that this revision can't be determined arithmetically (due
 145     # to cvsadmin -o), which is why this field is necessary.
 146     self.parent = None
 147
 148     # The revision number of the primary child of this revision (the
 149     # child along the same line of development), if any; otherwise,
 150     # None.
 151     self.child = None
 152
 153     # The _BranchData instances of branches that sprout from this
 154     # revision, sorted in ascending order by branch number.  It would
 155     # be inconvenient to initialize it here because we would have to
 156     # scan through all branches known by the _SymbolDataCollector to
 157     # find the ones having us as the parent.  Instead, this
 158     # information is filled in by
 159     # _FileDataCollector._resolve_dependencies() and sorted by
 160     # _FileDataCollector._sort_branches().
 161     self.branches_data = []
 162
 163     # The revision numbers of the first commits on any branches on
 164     # which commits occurred.  This dependency is kept explicitly
 165     # because otherwise a revision-only topological sort would miss
 166     # the dependency that exists via branches_data.
 167     self.branches_revs_data = []
 168
 169     # The _TagData instances of tags that are connected to this
 170     # revision.
 171     self.tags_data = []
 172
 173     # A token that may be set by a RevisionCollector, then used by
 174     # RevisionReader to obtain the text again.
 175     self.revision_reader_token = None
 176
 177   def get_first_on_branch_id(self):
 178     return self.parent_branch_data and self.parent_branch_data.id
 179
 180
 181 class _SymbolData:
 182   """Collection area for information about a symbol in a single CVSFile.
 183
 184   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 185   Tag regardless of whether self is a _BranchData or a _TagData."""
 186
 187   def __init__(self, id, symbol):
 188     """Initialize an object for SYMBOL."""
 189
 190     # The unique id that will be used for this particular symbol in
 191     # this particular file.  This same id will be used for the CVSItem
 192     # that is derived from this instance.
 193     self.id = id
 194
 195     # An instance of Symbol.
 196     self.symbol = symbol
 197
 198
 199 class _BranchData(_SymbolData):
 200   """Collection area for information about a Branch in a single CVSFile."""
 201
 202   def __init__(self, id, symbol, branch_number):
 203     _SymbolData.__init__(self, id, symbol)
 204
 205     # The branch number (e.g., '1.5.2') of this branch.
 206     self.branch_number = branch_number
 207
 208     # The revision number of the revision from which this branch
 209     # sprouts (e.g., '1.5').
 210     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 211
 212     # The revision number of the first commit on this branch, if any
 213     # (e.g., '1.5.2.1'); otherwise, None.
 214     self.child = None
 215
 216
 217 class _TagData(_SymbolData):
 218   """Collection area for information about a Tag in a single CVSFile."""
 219
 220   def __init__(self, id, symbol, rev):
 221     _SymbolData.__init__(self, id, symbol)
 222
 223     # The revision number being tagged (e.g., '1.5.2.3').
 224     self.rev = rev
 225
 226
 227 class _SymbolDataCollector(object):
 228   """Collect information about symbols in a single CVSFile."""
 229
 230   def __init__(self, fdc, cvs_file):
 231     self.fdc = fdc
 232     self.cvs_file = cvs_file
 233
 234     self.pdc = self.fdc.pdc
 235     self.collect_data = self.fdc.collect_data
 236
 237     # A list [(name, revision), ...] of symbols defined in the header
 238     # of the file.  The name has already been transformed using the
 239     # symbol transform rules.  If the symbol transform rules indicate
 240     # that the symbol should be ignored, then it is never added to
 241     # this list.  This list is processed then deleted in
 242     # process_symbols().
 243     self._symbol_defs = []
 244
 245     # A set containing the transformed names of symbols in this file
 246     # (used to detect duplicates during processing of unlabeled
 247     # branches):
 248     self._defined_symbols = set()
 249
 250     # Map { branch_number : _BranchData }, where branch_number has an
 251     # odd number of digits.
 252     self.branches_data = { }
 253
 254     # Map { revision : [ tag_data ] }, where revision has an even
 255     # number of digits, and the value is a list of _TagData objects
 256     # for tags that apply to that revision.
 257     self.tags_data = { }
 258
 259   def _add_branch(self, name, branch_number):
 260     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 261     and derive and record the revision from which NAME sprouts.
 262     BRANCH_NUMBER is an RCS branch number with an odd number of
 263     components, for example '1.7.2' (never '1.7.0.2').  Return the
 264     _BranchData instance (which is usually newly-created)."""
 265
 266     branch_data = self.branches_data.get(branch_number)
 267
 268     if branch_data is not None:
 269       logger.warn(
 270           "%s: in '%s':\n"
 271           "   branch '%s' already has name '%s',\n"
 272           "   cannot also have name '%s', ignoring the latter\n"
 273           % (warning_prefix,
 274              self.cvs_file.filename, branch_number,
 275              branch_data.symbol.name, name)
 276           )
 277       return branch_data
 278
 279     symbol = self.pdc.get_symbol(name)
 280     branch_data = _BranchData(
 281         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 282         )
 283     self.branches_data[branch_number] = branch_data
 284     return branch_data
 285
 286   def _construct_distinct_name(self, name, original_name):
 287     """Construct a distinct symbol name from NAME.
 288
 289     If NAME is distinct, return it.  If it is already used in this
 290     file (as determined from its presence in self._defined_symbols),
 291     construct and return a new name that is not already used."""
 292
 293     if name not in self._defined_symbols:
 294       return name
 295     else:
 296       index = 1
 297       while True:
 298         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 299         if dup_name not in self._defined_symbols:
 300           self.collect_data.record_fatal_error(
 301               "Symbol name '%s' is already used in '%s'.\n"
 302               "The unlabeled branch '%s' must be renamed using "
 303               "--symbol-transform."
 304               % (name, self.cvs_file.filename, original_name,)
 305               )
 306           return dup_name
 307
 308   def _add_unlabeled_branch(self, branch_number):
 309     original_name = "unlabeled-" + branch_number
 310     name = self.transform_symbol(original_name, branch_number)
 311     if name is None:
 312       self.collect_data.record_fatal_error(
 313           "The unlabeled branch '%s' in '%s' contains commits.\n"
 314           "It may not be ignored via a symbol transform.  (Use --exclude "
 315           "instead.)"
 316           % (original_name, self.cvs_file.filename,)
 317           )
 318       # Retain the original name to allow the conversion to continue:
 319       name = original_name
 320
 321     distinct_name = self._construct_distinct_name(name, original_name)
 322     self._defined_symbols.add(distinct_name)
 323     return self._add_branch(distinct_name, branch_number)
 324
 325   def _add_tag(self, name, revision):
 326     """Record that tag NAME refers to the specified REVISION."""
 327
 328     symbol = self.pdc.get_symbol(name)
 329     tag_data = _TagData(
 330         self.collect_data.item_key_generator.gen_id(), symbol, revision
 331         )
 332     self.tags_data.setdefault(revision, []).append(tag_data)
 333     return tag_data
 334
 335   def transform_symbol(self, name, revision):
 336     """Transform a symbol according to the project's symbol transforms.
 337
 338     Transform the symbol with the original name NAME and canonicalized
 339     revision number REVISION.  Return the new symbol name or None if
 340     the symbol should be ignored entirely.
 341
 342     Log the results of the symbol transform if necessary."""
 343
 344     old_name = name
 345     # Apply any user-defined symbol transforms to the symbol name:
 346     name = self.cvs_file.project.transform_symbol(
 347         self.cvs_file, name, revision
 348         )
 349
 350     if name is None:
 351       # Ignore symbol:
 352       self.pdc.log_symbol_transform(old_name, None)
 353       logger.verbose(
 354           "   symbol '%s'=%s ignored in %s"
 355           % (old_name, revision, self.cvs_file.filename,)
 356           )
 357     else:
 358       if name != old_name:
 359         self.pdc.log_symbol_transform(old_name, name)
 360         logger.verbose(
 361             "   symbol '%s'=%s transformed to '%s' in %s"
 362             % (old_name, revision, name, self.cvs_file.filename,)
 363             )
 364
 365     return name
 366
 367   def define_symbol(self, name, revision):
 368     """Record a symbol definition for later processing."""
 369
 370     # Canonicalize the revision number:
 371     revision = _branch_revision_re.sub(r'\1\2', revision)
 372
 373     # Apply any user-defined symbol transforms to the symbol name:
 374     name = self.transform_symbol(name, revision)
 375
 376     if name is not None:
 377       # Verify that the revision number is valid:
 378       if _valid_revision_re.match(revision):
 379         # The revision number is valid; record it for later processing:
 380         self._symbol_defs.append( (name, revision) )
 381       else:
 382         logger.warn(
 383             'In %r:\n'
 384             '    branch %r references invalid revision %s\n'
 385             '    and will be ignored.'
 386             % (self.cvs_file.filename, name, revision,)
 387             )
 388
 389   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 390     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 391
 392     Duplicate definitions of symbol names have been seen in the wild,
 393     and they can also happen when --symbol-transform is used.  If a
 394     symbol is defined to the same revision number repeatedly, then
 395     ignore all but the last definition."""
 396
 397     # Make a copy, since we have to iterate through the definitions
 398     # twice:
 399     symbol_defs = list(symbol_defs)
 400
 401     # A map { (name, revision) : [index,...] } of the indexes where
 402     # symbol definitions name=revision were found:
 403     known_definitions = {}
 404     for (i, symbol_def) in enumerate(symbol_defs):
 405       known_definitions.setdefault(symbol_def, []).append(i)
 406
 407     # A set of the indexes of entries that have to be removed from
 408     # symbol_defs:
 409     dup_indexes = set()
 410     for ((name, revision), indexes) in known_definitions.iteritems():
 411       if len(indexes) > 1:
 412         logger.verbose(
 413             "in %r:\n"
 414             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 415             % (self.cvs_file.filename, name, revision,)
 416             )
 417         dup_indexes.update(indexes[:-1])
 418
 419     for (i, symbol_def) in enumerate(symbol_defs):
 420       if i not in dup_indexes:
 421         yield symbol_def
 422
 423   def _process_duplicate_defs(self, symbol_defs):
 424     """Iterate through SYMBOL_DEFS, processing duplicate names.
 425
 426     Duplicate definitions of symbol names have been seen in the wild,
 427     and they can also happen when --symbol-transform is used.  If a
 428     symbol is defined multiple times, then it is a fatal error.  This
 429     method should be called after _eliminate_trivial_duplicate_defs()."""
 430
 431     # Make a copy, since we have to access multiple times:
 432     symbol_defs = list(symbol_defs)
 433
 434     # A map {name : [index,...]} mapping the names of symbols to a
 435     # list of their definitions' indexes in symbol_defs:
 436     known_symbols = {}
 437     for (i, (name, revision)) in enumerate(symbol_defs):
 438       known_symbols.setdefault(name, []).append(i)
 439
 440     known_symbols = known_symbols.items()
 441     known_symbols.sort()
 442     dup_indexes = set()
 443     for (name, indexes) in known_symbols:
 444       if len(indexes) > 1:
 445         # This symbol was defined multiple times.
 446         self.collect_data.record_fatal_error(
 447             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 448                 name, self.cvs_file.filename,
 449                 ' '.join([symbol_defs[i][1] for i in indexes]),
 450                 )
 451             )
 452         # Ignore all but the last definition for now, to allow the
 453         # conversion to proceed:
 454         dup_indexes.update(indexes[:-1])
 455
 456     for (i, symbol_def) in enumerate(symbol_defs):
 457       if i not in dup_indexes:
 458         yield symbol_def
 459
 460   def _process_symbol(self, name, revision):
 461     """Process a symbol called NAME, which is associated with REVISON.
 462
 463     REVISION is a canonical revision number with zeros removed, for
 464     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 465     transformed branch or tag name."""
 466
 467     # Add symbol to our records:
 468     if is_branch_revision_number(revision):
 469       self._add_branch(name, revision)
 470     else:
 471       self._add_tag(name, revision)
 472
 473   def process_symbols(self):
 474     """Process the symbol definitions from SELF._symbol_defs."""
 475
 476     symbol_defs = self._symbol_defs
 477     del self._symbol_defs
 478
 479     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 480     symbol_defs = self._process_duplicate_defs(symbol_defs)
 481
 482     for (name, revision) in symbol_defs:
 483       self._defined_symbols.add(name)
 484       self._process_symbol(name, revision)
 485
 486   @staticmethod
 487   def rev_to_branch_number(revision):
 488     """Return the branch_number of the branch on which REVISION lies.
 489
 490     REVISION is a branch revision number with an even number of
 491     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 492     The return value is the branch number (for example, '1.7.2').
 493     Return none iff REVISION is a trunk revision such as '1.2'."""
 494
 495     if is_trunk_revision(revision):
 496       return None
 497     return revision[:revision.rindex(".")]
 498
 499   def rev_to_branch_data(self, revision):
 500     """Return the branch_data of the branch on which REVISION lies.
 501
 502     REVISION must be a branch revision number with an even number of
 503     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 504     Raise KeyError iff REVISION is unknown."""
 505
 506     assert not is_trunk_revision(revision)
 507
 508     return self.branches_data[self.rev_to_branch_number(revision)]
 509
 510   def rev_to_lod(self, revision):
 511     """Return the line of development on which REVISION lies.
 512
 513     REVISION must be a revision number with an even number of
 514     components.  Raise KeyError iff REVISION is unknown."""
 515
 516     if is_trunk_revision(revision):
 517       return self.pdc.trunk
 518     else:
 519       return self.rev_to_branch_data(revision).symbol
 520
 521
 522 class _FileDataCollector(cvs2svn_rcsparse.Sink):
 523   """Class responsible for collecting RCS data for a particular file.
 524
 525   Any collected data that need to be remembered are stored into the
 526   referenced CollectData instance."""
 527
 528   def __init__(self, pdc, cvs_file):
 529     """Create an object that is prepared to receive data for CVS_FILE.
 530     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 531     information collected about the file."""
 532
 533     self.pdc = pdc
 534     self.cvs_file = cvs_file
 535
 536     self.collect_data = self.pdc.collect_data
 537     self.project = self.cvs_file.project
 538
 539     # A place to store information about the symbols in this file:
 540     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 541
 542     # { revision : _RevisionData instance }
 543     self._rev_data = { }
 544
 545     # Lists [ (parent, child) ] of revision number pairs indicating
 546     # that revision child depends on revision parent along the main
 547     # line of development.
 548     self._primary_dependencies = []
 549
 550     # If set, this is an RCS branch number -- rcsparse calls this the
 551     # "principal branch", but CVS and RCS refer to it as the "default
 552     # branch", so that's what we call it, even though the rcsparse API
 553     # setter method is still 'set_principal_branch'.
 554     self.default_branch = None
 555
 556     # True iff revision 1.1 of the file appears to have been imported
 557     # (as opposed to added normally).
 558     self._file_imported = False
 559
 560   def _get_rev_id(self, revision):
 561     if revision is None:
 562       return None
 563     return self._rev_data[revision].cvs_rev_id
 564
 565   def set_principal_branch(self, branch):
 566     """This is a callback method declared in Sink."""
 567
 568     if branch.find('.') == -1:
 569       # This just sets the default branch to trunk.  Normally this
 570       # shouldn't occur, but it has been seen in at least one CVS
 571       # repository.  Just ignore it.
 572       pass
 573     else:
 574       self.default_branch = branch
 575
 576   def define_tag(self, name, revision):
 577     """Remember the symbol name and revision, but don't process them yet.
 578
 579     This is a callback method declared in Sink."""
 580
 581     self.sdc.define_symbol(name, revision)
 582
 583   def set_expansion(self, mode):
 584     """This is a callback method declared in Sink."""
 585
 586     self.cvs_file.mode = mode
 587
 588   def admin_completed(self):
 589     """This is a callback method declared in Sink."""
 590
 591     self.sdc.process_symbols()
 592
 593   def define_revision(self, revision, timestamp, author, state,
 594                       branches, next):
 595     """This is a callback method declared in Sink."""
 596
 597     for branch in branches:
 598       try:
 599         branch_data = self.sdc.rev_to_branch_data(branch)
 600       except KeyError:
 601         # Normally we learn about the branches from the branch names
 602         # and numbers parsed from the symbolic name header.  But this
 603         # must have been an unlabeled branch that slipped through the
 604         # net.  Generate a name for it and create a _BranchData record
 605         # for it now.
 606         branch_data = self.sdc._add_unlabeled_branch(
 607             self.sdc.rev_to_branch_number(branch))
 608
 609       assert branch_data.child is None
 610       branch_data.child = branch
 611
 612     if revision in self._rev_data:
 613       # This revision has already been seen.
 614       logger.error('File %r contains duplicate definitions of revision %s.'
 615                   % (self.cvs_file.filename, revision,))
 616       raise RuntimeError
 617
 618     # Record basic information about the revision:
 619     rev_data = _RevisionData(
 620         self.collect_data.item_key_generator.gen_id(),
 621         revision, int(timestamp), author, state)
 622     self._rev_data[revision] = rev_data
 623
 624     # When on trunk, the RCS 'next' revision number points to what
 625     # humans might consider to be the 'previous' revision number.  For
 626     # example, 1.3's RCS 'next' is 1.2.
 627     #
 628     # However, on a branch, the RCS 'next' revision number really does
 629     # point to what humans would consider to be the 'next' revision
 630     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 631     #
 632     # In other words, in RCS, 'next' always means "where to find the next
 633     # deltatext that you need this revision to retrieve.
 634     #
 635     # That said, we don't *want* RCS's behavior here, so we determine
 636     # whether we're on trunk or a branch and set the dependencies
 637     # accordingly.
 638     if next:
 639       if is_trunk_revision(revision):
 640         self._primary_dependencies.append( (next, revision,) )
 641       else:
 642         self._primary_dependencies.append( (revision, next,) )
 643
 644   def tree_completed(self):
 645     """The revision tree has been parsed.
 646
 647     Analyze it for consistency and connect some loose ends.
 648
 649     This is a callback method declared in Sink."""
 650
 651     self._resolve_primary_dependencies()
 652     self._resolve_branch_dependencies()
 653     self._sort_branches()
 654     self._resolve_tag_dependencies()
 655
 656     # Compute the preliminary CVSFileItems for this file:
 657     cvs_items = []
 658     cvs_items.extend(self._get_cvs_revisions())
 659     cvs_items.extend(self._get_cvs_branches())
 660     cvs_items.extend(self._get_cvs_tags())
 661     self._cvs_file_items = CVSFileItems(
 662         self.cvs_file, self.pdc.trunk, cvs_items
 663         )
 664
 665     self._cvs_file_items.check_link_consistency()
 666
 667   def _resolve_primary_dependencies(self):
 668     """Resolve the dependencies listed in self._primary_dependencies."""
 669
 670     for (parent, child,) in self._primary_dependencies:
 671       parent_data = self._rev_data[parent]
 672       assert parent_data.child is None
 673       parent_data.child = child
 674
 675       child_data = self._rev_data[child]
 676       assert child_data.parent is None
 677       child_data.parent = parent
 678
 679   def _resolve_branch_dependencies(self):
 680     """Resolve dependencies involving branches."""
 681
 682     for branch_data in self.sdc.branches_data.values():
 683       # The branch_data's parent has the branch as a child regardless
 684       # of whether the branch had any subsequent commits:
 685       try:
 686         parent_data = self._rev_data[branch_data.parent]
 687       except KeyError:
 688         logger.warn(
 689             'In %r:\n'
 690             '    branch %r references non-existing revision %s\n'
 691             '    and will be ignored.'
 692             % (self.cvs_file.filename, branch_data.symbol.name,
 693                branch_data.parent,))
 694         del self.sdc.branches_data[branch_data.branch_number]
 695       else:
 696         parent_data.branches_data.append(branch_data)
 697
 698         # If the branch has a child (i.e., something was committed on
 699         # the branch), then we store a reference to the branch_data
 700         # there, define the child's parent to be the branch's parent,
 701         # and list the child in the branch parent's branches_revs_data:
 702         if branch_data.child is not None:
 703           child_data = self._rev_data[branch_data.child]
 704           assert child_data.parent_branch_data is None
 705           child_data.parent_branch_data = branch_data
 706           assert child_data.parent is None
 707           child_data.parent = branch_data.parent
 708           parent_data.branches_revs_data.append(branch_data.child)
 709
 710   def _sort_branches(self):
 711     """Sort the branches sprouting from each revision in creation order.
 712
 713     Creation order is taken to be the reverse of the order that they
 714     are listed in the symbols part of the RCS file.  (If a branch is
 715     created then deleted, a later branch can be assigned the recycled
 716     branch number; therefore branch numbers are not an indication of
 717     creation order.)"""
 718
 719     for rev_data in self._rev_data.values():
 720       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 721
 722   def _resolve_tag_dependencies(self):
 723     """Resolve dependencies involving tags."""
 724
 725     for (rev, tag_data_list) in self.sdc.tags_data.items():
 726       try:
 727         parent_data = self._rev_data[rev]
 728       except KeyError:
 729         logger.warn(
 730             'In %r:\n'
 731             '    the following tag(s) reference non-existing revision %s\n'
 732             '    and will be ignored:\n'
 733             '    %s' % (
 734                 self.cvs_file.filename, rev,
 735                 ', '.join([repr(tag_data.symbol.name)
 736                            for tag_data in tag_data_list]),))
 737         del self.sdc.tags_data[rev]
 738       else:
 739         for tag_data in tag_data_list:
 740           assert tag_data.rev == rev
 741           # The tag_data's rev has the tag as a child:
 742           parent_data.tags_data.append(tag_data)
 743
 744   def _get_cvs_branches(self):
 745     """Generate the CVSBranches present in this file."""
 746
 747     for branch_data in self.sdc.branches_data.values():
 748       yield CVSBranch(
 749           branch_data.id, self.cvs_file, branch_data.symbol,
 750           branch_data.branch_number,
 751           self.sdc.rev_to_lod(branch_data.parent),
 752           self._get_rev_id(branch_data.parent),
 753           self._get_rev_id(branch_data.child),
 754           None,
 755           )
 756
 757   def _get_cvs_tags(self):
 758     """Generate the CVSTags present in this file."""
 759
 760     for tags_data in self.sdc.tags_data.values():
 761       for tag_data in tags_data:
 762         yield CVSTag(
 763             tag_data.id, self.cvs_file, tag_data.symbol,
 764             self.sdc.rev_to_lod(tag_data.rev),
 765             self._get_rev_id(tag_data.rev),
 766             None,
 767             )
 768
 769   def set_description(self, description):
 770     """This is a callback method declared in Sink."""
 771
 772     self.cvs_file.description = description
 773     self.cvs_file.determine_file_properties(Ctx().file_property_setters)
 774
 775   def set_revision_info(self, revision, log, text):
 776     """This is a callback method declared in Sink."""
 777
 778     rev_data = self._rev_data[revision]
 779     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 780
 781     if cvs_rev.metadata_id is not None:
 782       # Users have reported problems with repositories in which the
 783       # deltatext block for revision 1.1 appears twice.  It is not
 784       # known whether this results from a CVS/RCS bug, or from botched
 785       # hand-editing of the repository.  In any case, empirically, cvs
 786       # and rcs both use the first version when checking out data, so
 787       # that's what we will do.  (For the record: "cvs log" fails on
 788       # such a file; "rlog" prints the log message from the first
 789       # block and ignores the second one.)
 790       logger.warn(
 791           "%s: in '%s':\n"
 792           "   Deltatext block for revision %s appeared twice;\n"
 793           "   ignoring the second occurrence.\n"
 794           % (warning_prefix, self.cvs_file.filename, revision,)
 795           )
 796       return
 797
 798     if is_trunk_revision(revision):
 799       branch_name = None
 800     else:
 801       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 802
 803     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 804         self.project, branch_name, rev_data.author, log
 805         )
 806     cvs_rev.deltatext_exists = bool(text)
 807
 808     # If this is revision 1.1, determine whether the file appears to
 809     # have been created via 'cvs add' instead of 'cvs import'.  The
 810     # test is that the log message CVS uses for 1.1 in imports is
 811     # "Initial revision\n" with no period.  (This fact helps determine
 812     # whether this file might have had a default branch in the past.)
 813     if revision == '1.1':
 814       self._file_imported = (log == 'Initial revision\n')
 815
 816   def parse_completed(self):
 817     """Finish the processing of this file.
 818
 819     This is a callback method declared in Sink."""
 820
 821     # Make sure that there was an info section for each revision:
 822     for cvs_item in self._cvs_file_items.values():
 823       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 824         self.collect_data.record_fatal_error(
 825             '%r has no deltatext section for revision %s'
 826             % (self.cvs_file.filename, cvs_item.rev,)
 827             )
 828
 829   def _determine_operation(self, rev_data):
 830     prev_rev_data = self._rev_data.get(rev_data.parent)
 831     return cvs_revision_type_map[(
 832         rev_data.state != 'dead',
 833         prev_rev_data is not None and prev_rev_data.state != 'dead',
 834         )]
 835
 836   def _get_cvs_revisions(self):
 837     """Generate the CVSRevisions present in this file."""
 838
 839     for rev_data in self._rev_data.itervalues():
 840       yield self._get_cvs_revision(rev_data)
 841
 842   def _get_cvs_revision(self, rev_data):
 843     """Create and return a CVSRevision for REV_DATA."""
 844
 845     branch_ids = [
 846         branch_data.id
 847         for branch_data in rev_data.branches_data
 848         ]
 849
 850     branch_commit_ids = [
 851         self._get_rev_id(rev)
 852         for rev in rev_data.branches_revs_data
 853         ]
 854
 855     tag_ids = [
 856         tag_data.id
 857         for tag_data in rev_data.tags_data
 858         ]
 859
 860     revision_type = self._determine_operation(rev_data)
 861
 862     return revision_type(
 863         self._get_rev_id(rev_data.rev), self.cvs_file,
 864         rev_data.timestamp, None,
 865         self._get_rev_id(rev_data.parent),
 866         self._get_rev_id(rev_data.child),
 867         rev_data.rev,
 868         True,
 869         self.sdc.rev_to_lod(rev_data.rev),
 870         rev_data.get_first_on_branch_id(),
 871         False, None, None,
 872         tag_ids, branch_ids, branch_commit_ids,
 873         rev_data.revision_reader_token
 874         )
 875
 876   def get_cvs_file_items(self):
 877     """Finish up and return a CVSFileItems instance for this file.
 878
 879     This method must only be called once."""
 880
 881     self._process_ntdbrs()
 882
 883     # Break a circular reference loop, allowing the memory for self
 884     # and sdc to be freed.
 885     del self.sdc
 886
 887     return self._cvs_file_items
 888
 889   def _process_ntdbrs(self):
 890     """Fix up any non-trunk default branch revisions (if present).
 891
 892     If a non-trunk default branch is determined to have existed, yield
 893     the _RevisionData.ids for all revisions that were once non-trunk
 894     default revisions, in dependency order.
 895
 896     There are two cases to handle:
 897
 898     One case is simple.  The RCS file lists a default branch
 899     explicitly in its header, such as '1.1.1'.  In this case, we know
 900     that every revision on the vendor branch is to be treated as head
 901     of trunk at that point in time.
 902
 903     But there's also a degenerate case.  The RCS file does not
 904     currently have a default branch, yet we can deduce that for some
 905     period in the past it probably *did* have one.  For example, the
 906     file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
 907     dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
 908     after 1.2.  In this case, we should record 1.1.1.96 as the last
 909     vendor revision to have been the head of the default branch.
 910
 911     If any non-trunk default branch revisions are found:
 912
 913     - Set their ntdbr members to True.
 914
 915     - Connect the last one with revision 1.2.
 916
 917     - Remove revision 1.1 if it is not needed.
 918
 919     """
 920
 921     try:
 922       if self.default_branch:
 923         vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
 924         vendor_lod_items = self._cvs_file_items.get_lod_items(
 925             self._cvs_file_items[vendor_cvs_branch_id]
 926             )
 927         if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
 928           return
 929       elif self._file_imported:
 930         vendor_branch_data = self.sdc.branches_data.get('1.1.1')
 931         if vendor_branch_data is None:
 932           return
 933         else:
 934           vendor_lod_items = self._cvs_file_items.get_lod_items(
 935               self._cvs_file_items[vendor_branch_data.id]
 936               )
 937           if not self._cvs_file_items.process_historical_ntdb(
 938                 vendor_lod_items
 939                 ):
 940             return
 941       else:
 942         return
 943     except VendorBranchError, e:
 944       self.collect_data.record_fatal_error(str(e))
 945       return
 946
 947     if self._file_imported:
 948       self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
 949
 950     self._cvs_file_items.check_link_consistency()
 951
 952
 953 class _ProjectDataCollector:
 954   def __init__(self, collect_data, project):
 955     self.collect_data = collect_data
 956     self.project = project
 957     self.num_files = 0
 958
 959     # The Trunk LineOfDevelopment object for this project:
 960     self.trunk = Trunk(
 961         self.collect_data.symbol_key_generator.gen_id(), self.project
 962         )
 963     self.project.trunk_id = self.trunk.id
 964
 965     # This causes a record for self.trunk to spring into existence:
 966     self.collect_data.register_trunk(self.trunk)
 967
 968     # A map { name -> Symbol } for all known symbols in this project.
 969     # The symbols listed here are undifferentiated into Branches and
 970     # Tags because the same name might appear as a branch in one file
 971     # and a tag in another.
 972     self.symbols = {}
 973
 974     # A map { (old_name, new_name) : count } indicating how many files
 975     # were affected by each each symbol name transformation:
 976     self.symbol_transform_counts = {}
 977
 978   def get_symbol(self, name):
 979     """Return the Symbol object for the symbol named NAME in this project.
 980
 981     If such a symbol does not yet exist, allocate a new symbol_id,
 982     create a Symbol instance, store it in self.symbols, and return it."""
 983
 984     symbol = self.symbols.get(name)
 985     if symbol is None:
 986       symbol = Symbol(
 987           self.collect_data.symbol_key_generator.gen_id(),
 988           self.project, name)
 989       self.symbols[name] = symbol
 990     return symbol
 991
 992   def log_symbol_transform(self, old_name, new_name):
 993     """Record that OLD_NAME was transformed to NEW_NAME in one file.
 994
 995     This information is used to generated a statistical summary of
 996     symbol transforms."""
 997
 998     try:
 999       self.symbol_transform_counts[old_name, new_name] += 1
1000     except KeyError:
1001       self.symbol_transform_counts[old_name, new_name] = 1
1002
1003   def summarize_symbol_transforms(self):
1004     if self.symbol_transform_counts and logger.is_on(logger.NORMAL):
1005       logger.normal('Summary of symbol transforms:')
1006       transforms = self.symbol_transform_counts.items()
1007       transforms.sort()
1008       for ((old_name, new_name), count) in transforms:
1009         if new_name is None:
1010           logger.normal('    "%s" ignored in %d files' % (old_name, count,))
1011         else:
1012           logger.normal(
1013               '    "%s" transformed to "%s" in %d files'
1014               % (old_name, new_name, count,)
1015               )
1016
1017   def process_file(self, cvs_file):
1018     logger.normal(cvs_file.filename)
1019     fdc = _FileDataCollector(self, cvs_file)
1020     try:
1021       cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
1022     except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
1023       self.collect_data.record_fatal_error(
1024           "%r is not a valid ,v file" % (cvs_file.filename,)
1025           )
1026       # Abort the processing of this file, but let the pass continue
1027       # with other files:
1028       return
1029     except:
1030       logger.warn("Exception occurred while parsing %s" % cvs_file.filename)
1031       raise
1032     else:
1033       self.num_files += 1
1034
1035     return fdc.get_cvs_file_items()
1036
1037
1038 class CollectData:
1039   """Repository for data collected by parsing the CVS repository files.
1040
1041   This class manages the databases into which information collected
1042   from the CVS repository is stored.  The data are stored into this
1043   class by _FileDataCollector instances, one of which is created for
1044   each file to be parsed."""
1045
1046   def __init__(self, stats_keeper):
1047     self._cvs_item_store = NewCVSItemStore(
1048         artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
1049     self.metadata_db = MetadataDatabase(
1050         artifact_manager.get_temp_file(config.METADATA_STORE),
1051         artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
1052         DB_OPEN_NEW,
1053         )
1054     self.metadata_logger = MetadataLogger(self.metadata_db)
1055     self.fatal_errors = []
1056     self.num_files = 0
1057     self.symbol_stats = SymbolStatisticsCollector()
1058     self.stats_keeper = stats_keeper
1059
1060     # Key generator for CVSFiles:
1061     self.file_key_generator = KeyGenerator()
1062
1063     # Key generator for CVSItems:
1064     self.item_key_generator = KeyGenerator()
1065
1066     # Key generator for Symbols:
1067     self.symbol_key_generator = KeyGenerator()
1068
1069   def record_fatal_error(self, err):
1070     """Record that fatal error ERR was found.
1071
1072     ERR is a string (without trailing newline) describing the error.
1073     Output the error to stderr immediately, and record a copy to be
1074     output again in a summary at the end of CollectRevsPass."""
1075
1076     err = '%s: %s' % (error_prefix, err,)
1077     logger.error(err + '\n')
1078     self.fatal_errors.append(err)
1079
1080   def add_cvs_directory(self, cvs_directory):
1081     """Record CVS_DIRECTORY."""
1082
1083     Ctx()._cvs_path_db.log_path(cvs_directory)
1084
1085   def add_cvs_file_items(self, cvs_file_items):
1086     """Record the information from CVS_FILE_ITEMS.
1087
1088     Store the CVSFile to _cvs_path_db under its persistent id, store
1089     the CVSItems, and record the CVSItems to self.stats_keeper."""
1090
1091     Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
1092     self._cvs_item_store.add(cvs_file_items)
1093
1094     self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
1095     for cvs_item in cvs_file_items.values():
1096       self.stats_keeper.record_cvs_item(cvs_item)
1097
1098   def register_trunk(self, trunk):
1099     """Create a symbol statistics record for the specified trunk LOD."""
1100
1101     # This causes a record to spring into existence:
1102     self.symbol_stats[trunk]
1103
1104   def _process_cvs_file_items(self, cvs_file_items):
1105     """Process the CVSFileItems from one CVSFile."""
1106
1107     # Remove an initial delete on trunk if it is not needed:
1108     cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)
1109
1110     # Remove initial branch deletes that are not needed:
1111     cvs_file_items.remove_initial_branch_deletes(self.metadata_db)
1112
1113     # If this is a --trunk-only conversion, discard all branches and
1114     # tags, then draft any non-trunk default branch revisions to
1115     # trunk:
1116     if Ctx().trunk_only:
1117       cvs_file_items.exclude_non_trunk()
1118
1119     cvs_file_items.check_link_consistency()
1120
1121     self.add_cvs_file_items(cvs_file_items)
1122     self.symbol_stats.register(cvs_file_items)
1123
1124   def process_project(self, project):
1125     Ctx()._projects[project.id] = project
1126
1127     pdc = _ProjectDataCollector(self, project)
1128
1129     found_rcs_file = False
1130     for cvs_path in walk_repository(
1131           project, self.file_key_generator, self.record_fatal_error
1132           ):
1133       if isinstance(cvs_path, CVSDirectory):
1134         self.add_cvs_directory(cvs_path)
1135       else:
1136         cvs_file_items = pdc.process_file(cvs_path)
1137         self._process_cvs_file_items(cvs_file_items)
1138         found_rcs_file = True
1139
1140     if not found_rcs_file:
1141       self.record_fatal_error(
1142           'No RCS files found under %r!\n'
1143           'Are you absolutely certain you are pointing cvs2svn\n'
1144           'at a CVS repository?\n'
1145           % (project.project_cvs_repos_path,)
1146           )
1147
1148     pdc.summarize_symbol_transforms()
1149
1150     self.num_files += pdc.num_files
1151     logger.verbose('Processed', self.num_files, 'files')
1152
1153   def _register_empty_subdirectories(self):
1154     """Set the CVSDirectory.empty_subdirectory_id members."""
1155
1156     directories = set(
1157         path
1158         for path in Ctx()._cvs_path_db.itervalues()
1159         if isinstance(path, CVSDirectory)
1160         )
1161     for path in Ctx()._cvs_path_db.itervalues():
1162       if isinstance(path, CVSFile):
1163         directory = path.parent_directory
1164         while directory is not None and directory in directories:
1165           directories.remove(directory)
1166           directory = directory.parent_directory
1167     for directory in directories:
1168       if directory.parent_directory is not None:
1169         directory.parent_directory.empty_subdirectory_ids.append(directory.id)
1170
1171   def _set_cvs_path_ordinals(self):
1172     cvs_files = list(Ctx()._cvs_path_db.itervalues())
1173     cvs_files.sort(CVSPath.slow_compare)
1174     for (i, cvs_file) in enumerate(cvs_files):
1175       cvs_file.ordinal = i
1176
1177   def close(self):
1178     """Close the data structures associated with this instance.
1179
1180     Return a list of fatal errors encountered while processing input.
1181     Each list entry is a string describing one fatal error."""
1182
1183     self.symbol_stats.purge_ghost_symbols()
1184     self.symbol_stats.close()
1185     self.symbol_stats = None
1186     self.metadata_logger = None
1187     self.metadata_db.close()
1188     self.metadata_db = None
1189     self._cvs_item_store.close()
1190     self._cvs_item_store = None
1191     self._register_empty_subdirectories()
1192     self._set_cvs_path_ordinals()
1193     retval = self.fatal_errors
1194     self.fatal_errors = None
1195     return retval
1196
1197