cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents.
  22
  23 As a *,v file is parsed, the information pertaining to the file is
  24 accumulated in memory, mostly in _RevisionData, _BranchData, and
  25 _TagData objects.  When parsing is complete, a final pass is made over
  26 the data to create some final dependency links, collect statistics,
  27 etc., then the _*Data objects are converted into CVSItem objects
  28 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  29 dumped into databases.
  30
  31 During the data collection, persistent unique ids are allocated to
  32 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  33 special case.  CVSItem ids are unique across all CVSItem types, and
  34 the ids are carried over from the corresponding data collection
  35 objects:
  36
  37     _RevisionData -> CVSRevision
  38
  39     _BranchData -> CVSBranch
  40
  41     _TagData -> CVSTag
  42
  43 In a later pass it is possible to convert tags <-> branches.  But even
  44 if this occurs, the new branch or tag uses the same id as the old tag
  45 or branch.
  46
  47 """
  48
  49
  50 import os
  51 import stat
  52 import re
  53
  54 from cvs2svn_lib import config
  55 from cvs2svn_lib.common import DB_OPEN_NEW
  56 from cvs2svn_lib.common import warning_prefix
  57 from cvs2svn_lib.common import error_prefix
  58 from cvs2svn_lib.common import is_trunk_revision
  59 from cvs2svn_lib.common import is_branch_revision_number
  60 from cvs2svn_lib.log import Log
  61 from cvs2svn_lib.context import Ctx
  62 from cvs2svn_lib.artifact_manager import artifact_manager
  63 from cvs2svn_lib.cvs_path import CVSPath
  64 from cvs2svn_lib.cvs_path import CVSFile
  65 from cvs2svn_lib.cvs_path import CVSDirectory
  66 from cvs2svn_lib.symbol import Symbol
  67 from cvs2svn_lib.symbol import Trunk
  68 from cvs2svn_lib.cvs_item import CVSRevision
  69 from cvs2svn_lib.cvs_item import CVSBranch
  70 from cvs2svn_lib.cvs_item import CVSTag
  71 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  72 from cvs2svn_lib.cvs_file_items import VendorBranchError
  73 from cvs2svn_lib.cvs_file_items import CVSFileItems
  74 from cvs2svn_lib.key_generator import KeyGenerator
  75 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  76 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  77 from cvs2svn_lib.metadata_database import MetadataDatabase
  78 from cvs2svn_lib.metadata_database import MetadataLogger
  79 from cvs2svn_lib.repository_walker import walk_repository
  80
  81 import cvs2svn_rcsparse
  82
  83
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  A valid number is
# made of two or more groups of decimal digits separated by single
# dots, for example '1.7' or '1.7.2'.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching branch numbers: an even number (at
# least two) of leading digit groups, an optional extra '0' group, and
# a final digit group.  Group 1 captures the leading groups including
# their trailing dot and group 2 captures the final group, so that
# substituting r'\1\2' drops the extra zero (e.g., '1.7.0.2' becomes
# '1.7.2').  A string that does not match (e.g., '1.7' or '1.7.2.1')
# is returned unchanged by such a substitution.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
 100
 101
 102 def is_same_line_of_development(rev1, rev2):
 103   """Return True if rev1 and rev2 are on the same line of
 104   development (i.e., both on trunk, or both on the same branch);
 105   return False otherwise.  Either rev1 or rev2 can be None, in
 106   which case automatically return False."""
 107
 108   if rev1 is None or rev2 is None:
 109     return False
 110   if is_trunk_revision(rev1) and is_trunk_revision(rev2):
 111     # Trunk revisions have to be handled specially because the main
 112     # trunk version number can be changed; e.g., from 1 to 2.
 113     return True
 114   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 115     return True
 116   return False
 117
 118
 119 class _RevisionData:
 120   """We track the state of each revision so that in set_revision_info,
 121   we can determine if our op is an add/change/delete.  We can do this
 122   because in set_revision_info, we'll have all of the _RevisionData
 123   for a file at our fingertips, and we need to examine the state of
 124   our prev_rev to determine if we're an add or a change.  Without the
 125   state of the prev_rev, we are unable to distinguish between an add
 126   and a change."""
 127
 128   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 129     # The id of this revision:
 130     self.cvs_rev_id = cvs_rev_id
 131     self.rev = rev
 132     self.timestamp = timestamp
 133     self.author = author
 134     self.state = state
 135
 136     # If this is the first revision on a branch, then this is the
 137     # branch_data of that branch; otherwise it is None.
 138     self.parent_branch_data = None
 139
 140     # The revision number of the parent of this revision along the
 141     # same line of development, if any.  For the first revision R on a
 142     # branch, we consider the revision from which R sprouted to be the
 143     # 'parent'.  If this is the root revision in the file's revision
 144     # tree, then this field is None.
 145     #
 146     # Note that this revision can't be determined arithmetically (due
 147     # to cvsadmin -o), which is why this field is necessary.
 148     self.parent = None
 149
 150     # The revision number of the primary child of this revision (the
 151     # child along the same line of development), if any; otherwise,
 152     # None.
 153     self.child = None
 154
 155     # The _BranchData instances of branches that sprout from this
 156     # revision, sorted in ascending order by branch number.  It would
 157     # be inconvenient to initialize it here because we would have to
 158     # scan through all branches known by the _SymbolDataCollector to
 159     # find the ones having us as the parent.  Instead, this
 160     # information is filled in by
 161     # _FileDataCollector._resolve_dependencies() and sorted by
 162     # _FileDataCollector._sort_branches().
 163     self.branches_data = []
 164
 165     # The revision numbers of the first commits on any branches on
 166     # which commits occurred.  This dependency is kept explicitly
 167     # because otherwise a revision-only topological sort would miss
 168     # the dependency that exists via branches_data.
 169     self.branches_revs_data = []
 170
 171     # The _TagData instances of tags that are connected to this
 172     # revision.
 173     self.tags_data = []
 174
 175     # A token that may be set by a RevisionCollector, then used by
 176     # RevisionReader to obtain the text again.
 177     self.revision_reader_token = None
 178
 179   def get_first_on_branch_id(self):
 180     return self.parent_branch_data and self.parent_branch_data.id
 181
 182
 183 class _SymbolData:
 184   """Collection area for information about a symbol in a single CVSFile.
 185
 186   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 187   Tag regardless of whether self is a _BranchData or a _TagData."""
 188
 189   def __init__(self, id, symbol):
 190     """Initialize an object for SYMBOL."""
 191
 192     # The unique id that will be used for this particular symbol in
 193     # this particular file.  This same id will be used for the CVSItem
 194     # that is derived from this instance.
 195     self.id = id
 196
 197     # An instance of Symbol.
 198     self.symbol = symbol
 199
 200
 201 class _BranchData(_SymbolData):
 202   """Collection area for information about a Branch in a single CVSFile."""
 203
 204   def __init__(self, id, symbol, branch_number):
 205     _SymbolData.__init__(self, id, symbol)
 206
 207     # The branch number (e.g., '1.5.2') of this branch.
 208     self.branch_number = branch_number
 209
 210     # The revision number of the revision from which this branch
 211     # sprouts (e.g., '1.5').
 212     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 213
 214     # The revision number of the first commit on this branch, if any
 215     # (e.g., '1.5.2.1'); otherwise, None.
 216     self.child = None
 217
 218
class _TagData(_SymbolData):
  """Collection area for information about a Tag in a single CVSFile."""

  def __init__(self, id, symbol, rev):
    """Initialize an object for the tag SYMBOL attached to revision REV.

    ID and SYMBOL are handed to _SymbolData.__init__()."""

    _SymbolData.__init__(self, id, symbol)

    # The revision number being tagged (e.g., '1.5.2.3').
    self.rev = rev
 227
 228
 229 class _SymbolDataCollector(object):
 230   """Collect information about symbols in a single CVSFile."""
 231
 232   def __init__(self, fdc, cvs_file):
 233     self.fdc = fdc
 234     self.cvs_file = cvs_file
 235
 236     self.pdc = self.fdc.pdc
 237     self.collect_data = self.fdc.collect_data
 238
 239     # A list [(name, revision), ...] of symbols defined in the header
 240     # of the file.  The name has already been transformed using the
 241     # symbol transform rules.  If the symbol transform rules indicate
 242     # that the symbol should be ignored, then it is never added to
 243     # this list.  This list is processed then deleted in
 244     # process_symbols().
 245     self._symbol_defs = []
 246
 247     # A set containing the transformed names of symbols in this file
 248     # (used to detect duplicates during processing of unlabeled
 249     # branches):
 250     self._defined_symbols = set()
 251
 252     # Map { branch_number : _BranchData }, where branch_number has an
 253     # odd number of digits.
 254     self.branches_data = { }
 255
 256     # Map { revision : [ tag_data ] }, where revision has an even
 257     # number of digits, and the value is a list of _TagData objects
 258     # for tags that apply to that revision.
 259     self.tags_data = { }
 260
 261   def _add_branch(self, name, branch_number):
 262     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 263     and derive and record the revision from which NAME sprouts.
 264     BRANCH_NUMBER is an RCS branch number with an odd number of
 265     components, for example '1.7.2' (never '1.7.0.2').  Return the
 266     _BranchData instance (which is usually newly-created)."""
 267
 268     branch_data = self.branches_data.get(branch_number)
 269
 270     if branch_data is not None:
 271       Log().warn(
 272           "%s: in '%s':\n"
 273           "   branch '%s' already has name '%s',\n"
 274           "   cannot also have name '%s', ignoring the latter\n"
 275           % (warning_prefix,
 276              self.cvs_file.filename, branch_number,
 277              branch_data.symbol.name, name)
 278           )
 279       return branch_data
 280
 281     symbol = self.pdc.get_symbol(name)
 282     branch_data = _BranchData(
 283         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 284         )
 285     self.branches_data[branch_number] = branch_data
 286     return branch_data
 287
 288   def _construct_distinct_name(self, name, original_name):
 289     """Construct a distinct symbol name from NAME.
 290
 291     If NAME is distinct, return it.  If it is already used in this
 292     file (as determined from its presence in self._defined_symbols),
 293     construct and return a new name that is not already used."""
 294
 295     if name not in self._defined_symbols:
 296       return name
 297     else:
 298       index = 1
 299       while True:
 300         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 301         if dup_name not in self._defined_symbols:
 302           self.collect_data.record_fatal_error(
 303               "Symbol name '%s' is already used in '%s'.\n"
 304               "The unlabeled branch '%s' must be renamed using "
 305               "--symbol-transform."
 306               % (name, self.cvs_file.filename, original_name,)
 307               )
 308           return dup_name
 309
 310   def _add_unlabeled_branch(self, branch_number):
 311     original_name = "unlabeled-" + branch_number
 312     name = self.transform_symbol(original_name, branch_number)
 313     if name is None:
 314       self.collect_data.record_fatal_error(
 315           "The unlabeled branch '%s' in '%s' contains commits.\n"
 316           "It may not be ignored via a symbol transform.  (Use --exclude "
 317           "instead.)"
 318           % (original_name, self.cvs_file.filename,)
 319           )
 320       # Retain the original name to allow the conversion to continue:
 321       name = original_name
 322
 323     distinct_name = self._construct_distinct_name(name, original_name)
 324     self._defined_symbols.add(distinct_name)
 325     return self._add_branch(distinct_name, branch_number)
 326
 327   def _add_tag(self, name, revision):
 328     """Record that tag NAME refers to the specified REVISION."""
 329
 330     symbol = self.pdc.get_symbol(name)
 331     tag_data = _TagData(
 332         self.collect_data.item_key_generator.gen_id(), symbol, revision
 333         )
 334     self.tags_data.setdefault(revision, []).append(tag_data)
 335     return tag_data
 336
 337   def transform_symbol(self, name, revision):
 338     """Transform a symbol according to the project's symbol transforms.
 339
 340     Transform the symbol with the original name NAME and canonicalized
 341     revision number REVISION.  Return the new symbol name or None if
 342     the symbol should be ignored entirely.
 343
 344     Log the results of the symbol transform if necessary."""
 345
 346     old_name = name
 347     # Apply any user-defined symbol transforms to the symbol name:
 348     name = self.cvs_file.project.transform_symbol(
 349         self.cvs_file, name, revision
 350         )
 351
 352     if name is None:
 353       # Ignore symbol:
 354       self.pdc.log_symbol_transform(old_name, None)
 355       Log().verbose(
 356           "   symbol '%s'=%s ignored in %s"
 357           % (old_name, revision, self.cvs_file.filename,)
 358           )
 359     else:
 360       if name != old_name:
 361         self.pdc.log_symbol_transform(old_name, name)
 362         Log().verbose(
 363             "   symbol '%s'=%s transformed to '%s' in %s"
 364             % (old_name, revision, name, self.cvs_file.filename,)
 365             )
 366
 367     return name
 368
 369   def define_symbol(self, name, revision):
 370     """Record a symbol definition for later processing."""
 371
 372     # Canonicalize the revision number:
 373     revision = _branch_revision_re.sub(r'\1\2', revision)
 374
 375     # Apply any user-defined symbol transforms to the symbol name:
 376     name = self.transform_symbol(name, revision)
 377
 378     if name is not None:
 379       # Verify that the revision number is valid:
 380       if _valid_revision_re.match(revision):
 381         # The revision number is valid; record it for later processing:
 382         self._symbol_defs.append( (name, revision) )
 383       else:
 384         Log().warn(
 385             'In %r:\n'
 386             '    branch %r references invalid revision %s\n'
 387             '    and will be ignored.'
 388             % (self.cvs_file.filename, name, revision,)
 389             )
 390
 391   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 392     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 393
 394     Duplicate definitions of symbol names have been seen in the wild,
 395     and they can also happen when --symbol-transform is used.  If a
 396     symbol is defined to the same revision number repeatedly, then
 397     ignore all but the last definition."""
 398
 399     # Make a copy, since we have to iterate through the definitions
 400     # twice:
 401     symbol_defs = list(symbol_defs)
 402
 403     # A map { (name, revision) : [index,...] } of the indexes where
 404     # symbol definitions name=revision were found:
 405     known_definitions = {}
 406     for (i, symbol_def) in enumerate(symbol_defs):
 407       known_definitions.setdefault(symbol_def, []).append(i)
 408
 409     # A set of the indexes of entries that have to be removed from
 410     # symbol_defs:
 411     dup_indexes = set()
 412     for ((name, revision), indexes) in known_definitions.iteritems():
 413       if len(indexes) > 1:
 414         Log().verbose(
 415             "in %r:\n"
 416             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 417             % (self.cvs_file.filename, name, revision,)
 418             )
 419         dup_indexes.update(indexes[:-1])
 420
 421     for (i, symbol_def) in enumerate(symbol_defs):
 422       if i not in dup_indexes:
 423         yield symbol_def
 424
 425   def _process_duplicate_defs(self, symbol_defs):
 426     """Iterate through SYMBOL_DEFS, processing duplicate names.
 427
 428     Duplicate definitions of symbol names have been seen in the wild,
 429     and they can also happen when --symbol-transform is used.  If a
 430     symbol is defined multiple times, then it is a fatal error.  This
 431     method should be called after _eliminate_trivial_duplicate_defs()."""
 432
 433     # Make a copy, since we have to access multiple times:
 434     symbol_defs = list(symbol_defs)
 435
 436     # A map {name : [index,...]} mapping the names of symbols to a
 437     # list of their definitions' indexes in symbol_defs:
 438     known_symbols = {}
 439     for (i, (name, revision)) in enumerate(symbol_defs):
 440       known_symbols.setdefault(name, []).append(i)
 441
 442     known_symbols = known_symbols.items()
 443     known_symbols.sort()
 444     dup_indexes = set()
 445     for (name, indexes) in known_symbols:
 446       if len(indexes) > 1:
 447         # This symbol was defined multiple times.
 448         self.collect_data.record_fatal_error(
 449             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 450                 name, self.cvs_file.filename,
 451                 ' '.join([symbol_defs[i][1] for i in indexes]),
 452                 )
 453             )
 454         # Ignore all but the last definition for now, to allow the
 455         # conversion to proceed:
 456         dup_indexes.update(indexes[:-1])
 457
 458     for (i, symbol_def) in enumerate(symbol_defs):
 459       if i not in dup_indexes:
 460         yield symbol_def
 461
 462   def _process_symbol(self, name, revision):
 463     """Process a symbol called NAME, which is associated with REVISON.
 464
 465     REVISION is a canonical revision number with zeros removed, for
 466     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 467     transformed branch or tag name."""
 468
 469     # Add symbol to our records:
 470     if is_branch_revision_number(revision):
 471       self._add_branch(name, revision)
 472     else:
 473       self._add_tag(name, revision)
 474
 475   def process_symbols(self):
 476     """Process the symbol definitions from SELF._symbol_defs."""
 477
 478     symbol_defs = self._symbol_defs
 479     del self._symbol_defs
 480
 481     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 482     symbol_defs = self._process_duplicate_defs(symbol_defs)
 483
 484     for (name, revision) in symbol_defs:
 485       self._defined_symbols.add(name)
 486       self._process_symbol(name, revision)
 487
 488   @staticmethod
 489   def rev_to_branch_number(revision):
 490     """Return the branch_number of the branch on which REVISION lies.
 491
 492     REVISION is a branch revision number with an even number of
 493     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 494     The return value is the branch number (for example, '1.7.2').
 495     Return none iff REVISION is a trunk revision such as '1.2'."""
 496
 497     if is_trunk_revision(revision):
 498       return None
 499     return revision[:revision.rindex(".")]
 500
 501   def rev_to_branch_data(self, revision):
 502     """Return the branch_data of the branch on which REVISION lies.
 503
 504     REVISION must be a branch revision number with an even number of
 505     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 506     Raise KeyError iff REVISION is unknown."""
 507
 508     assert not is_trunk_revision(revision)
 509
 510     return self.branches_data[self.rev_to_branch_number(revision)]
 511
 512   def rev_to_lod(self, revision):
 513     """Return the line of development on which REVISION lies.
 514
 515     REVISION must be a revision number with an even number of
 516     components.  Raise KeyError iff REVISION is unknown."""
 517
 518     if is_trunk_revision(revision):
 519       return self.pdc.trunk
 520     else:
 521       return self.rev_to_branch_data(revision).symbol
 522
 523
 524 class _FileDataCollector(cvs2svn_rcsparse.Sink):
 525   """Class responsible for collecting RCS data for a particular file.
 526
 527   Any collected data that need to be remembered are stored into the
 528   referenced CollectData instance."""
 529
 530   def __init__(self, pdc, cvs_file):
 531     """Create an object that is prepared to receive data for CVS_FILE.
 532     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 533     information collected about the file."""
 534
 535     self.pdc = pdc
 536     self.cvs_file = cvs_file
 537
 538     self.collect_data = self.pdc.collect_data
 539     self.project = self.cvs_file.project
 540
 541     # A place to store information about the symbols in this file:
 542     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 543
 544     # { revision : _RevisionData instance }
 545     self._rev_data = { }
 546
 547     # Lists [ (parent, child) ] of revision number pairs indicating
 548     # that revision child depends on revision parent along the main
 549     # line of development.
 550     self._primary_dependencies = []
 551
 552     # If set, this is an RCS branch number -- rcsparse calls this the
 553     # "principal branch", but CVS and RCS refer to it as the "default
 554     # branch", so that's what we call it, even though the rcsparse API
 555     # setter method is still 'set_principal_branch'.
 556     self.default_branch = None
 557
 558     # True iff revision 1.1 of the file appears to have been imported
 559     # (as opposed to added normally).
 560     self._file_imported = False
 561
 562   def _get_rev_id(self, revision):
 563     if revision is None:
 564       return None
 565     return self._rev_data[revision].cvs_rev_id
 566
 567   def set_principal_branch(self, branch):
 568     """This is a callback method declared in Sink."""
 569
 570     if branch.find('.') == -1:
 571       # This just sets the default branch to trunk.  Normally this
 572       # shouldn't occur, but it has been seen in at least one CVS
 573       # repository.  Just ignore it.
 574       pass
 575     else:
 576       self.default_branch = branch
 577
  def define_tag(self, name, revision):
    """Remember the symbol name and revision, but don't process them yet.

    NAME and REVISION are handed unchanged to the symbol data
    collector, which queues the definition; the queue is worked off
    when admin_completed() calls process_symbols().

    This is a callback method declared in Sink."""

    self.sdc.define_symbol(name, revision)
 584
  def set_expansion(self, mode):
    """Store MODE as the mode attribute of the CVSFile.

    This is a callback method declared in Sink."""

    # NOTE(review): presumably MODE is the RCS keyword expansion mode
    # of the file -- confirm against the Sink interface.
    self.cvs_file.mode = mode
 589
  def admin_completed(self):
    """Process the symbol definitions queued so far by define_tag().

    This is a callback method declared in Sink."""

    # After this call the symbol data collector holds the _BranchData
    # and _TagData records for the symbols that were defined.
    self.sdc.process_symbols()
 594
 595   def define_revision(self, revision, timestamp, author, state,
 596                       branches, next):
 597     """This is a callback method declared in Sink."""
 598
 599     for branch in branches:
 600       try:
 601         branch_data = self.sdc.rev_to_branch_data(branch)
 602       except KeyError:
 603         # Normally we learn about the branches from the branch names
 604         # and numbers parsed from the symbolic name header.  But this
 605         # must have been an unlabeled branch that slipped through the
 606         # net.  Generate a name for it and create a _BranchData record
 607         # for it now.
 608         branch_data = self.sdc._add_unlabeled_branch(
 609             self.sdc.rev_to_branch_number(branch))
 610
 611       assert branch_data.child is None
 612       branch_data.child = branch
 613
 614     if revision in self._rev_data:
 615       # This revision has already been seen.
 616       Log().error('File %r contains duplicate definitions of revision %s.'
 617                   % (self.cvs_file.filename, revision,))
 618       raise RuntimeError
 619
 620     # Record basic information about the revision:
 621     rev_data = _RevisionData(
 622         self.collect_data.item_key_generator.gen_id(),
 623         revision, int(timestamp), author, state)
 624     self._rev_data[revision] = rev_data
 625
 626     # When on trunk, the RCS 'next' revision number points to what
 627     # humans might consider to be the 'previous' revision number.  For
 628     # example, 1.3's RCS 'next' is 1.2.
 629     #
 630     # However, on a branch, the RCS 'next' revision number really does
 631     # point to what humans would consider to be the 'next' revision
 632     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 633     #
 634     # In other words, in RCS, 'next' always means "where to find the next
 635     # deltatext that you need this revision to retrieve.
 636     #
 637     # That said, we don't *want* RCS's behavior here, so we determine
 638     # whether we're on trunk or a branch and set the dependencies
 639     # accordingly.
 640     if next:
 641       if is_trunk_revision(revision):
 642         self._primary_dependencies.append( (next, revision,) )
 643       else:
 644         self._primary_dependencies.append( (revision, next,) )
 645
 646   def tree_completed(self):
 647     """The revision tree has been parsed.
 648
 649     Analyze it for consistency and connect some loose ends.
 650
 651     This is a callback method declared in Sink."""
 652
 653     self._resolve_primary_dependencies()
 654     self._resolve_branch_dependencies()
 655     self._sort_branches()
 656     self._resolve_tag_dependencies()
 657
 658     # Compute the preliminary CVSFileItems for this file:
 659     cvs_items = []
 660     cvs_items.extend(self._get_cvs_revisions())
 661     cvs_items.extend(self._get_cvs_branches())
 662     cvs_items.extend(self._get_cvs_tags())
 663     self._cvs_file_items = CVSFileItems(
 664         self.cvs_file, self.pdc.trunk, cvs_items
 665         )
 666
 667     self._cvs_file_items.check_link_consistency()
 668
 669   def _resolve_primary_dependencies(self):
 670     """Resolve the dependencies listed in self._primary_dependencies."""
 671
 672     for (parent, child,) in self._primary_dependencies:
 673       parent_data = self._rev_data[parent]
 674       assert parent_data.child is None
 675       parent_data.child = child
 676
 677       child_data = self._rev_data[child]
 678       assert child_data.parent is None
 679       child_data.parent = parent
 680
 681   def _resolve_branch_dependencies(self):
 682     """Resolve dependencies involving branches."""
 683
 684     for branch_data in self.sdc.branches_data.values():
 685       # The branch_data's parent has the branch as a child regardless
 686       # of whether the branch had any subsequent commits:
 687       try:
 688         parent_data = self._rev_data[branch_data.parent]
 689       except KeyError:
 690         Log().warn(
 691             'In %r:\n'
 692             '    branch %r references non-existing revision %s\n'
 693             '    and will be ignored.'
 694             % (self.cvs_file.filename, branch_data.symbol.name,
 695                branch_data.parent,))
 696         del self.sdc.branches_data[branch_data.branch_number]
 697       else:
 698         parent_data.branches_data.append(branch_data)
 699
 700         # If the branch has a child (i.e., something was committed on
 701         # the branch), then we store a reference to the branch_data
 702         # there, define the child's parent to be the branch's parent,
 703         # and list the child in the branch parent's branches_revs_data:
 704         if branch_data.child is not None:
 705           child_data = self._rev_data[branch_data.child]
 706           assert child_data.parent_branch_data is None
 707           child_data.parent_branch_data = branch_data
 708           assert child_data.parent is None
 709           child_data.parent = branch_data.parent
 710           parent_data.branches_revs_data.append(branch_data.child)
 711
 712   def _sort_branches(self):
 713     """Sort the branches sprouting from each revision in creation order.
 714
 715     Creation order is taken to be the reverse of the order that they
 716     are listed in the symbols part of the RCS file.  (If a branch is
 717     created then deleted, a later branch can be assigned the recycled
 718     branch number; therefore branch numbers are not an indication of
 719     creation order.)"""
 720
 721     for rev_data in self._rev_data.values():
 722       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 723
 724   def _resolve_tag_dependencies(self):
 725     """Resolve dependencies involving tags."""
 726
 727     for (rev, tag_data_list) in self.sdc.tags_data.items():
 728       try:
 729         parent_data = self._rev_data[rev]
 730       except KeyError:
 731         Log().warn(
 732             'In %r:\n'
 733             '    the following tag(s) reference non-existing revision %s\n'
 734             '    and will be ignored:\n'
 735             '    %s' % (
 736                 self.cvs_file.filename, rev,
 737                 ', '.join([repr(tag_data.symbol.name)
 738                            for tag_data in tag_data_list]),))
 739         del self.sdc.tags_data[rev]
 740       else:
 741         for tag_data in tag_data_list:
 742           assert tag_data.rev == rev
 743           # The tag_data's rev has the tag as a child:
 744           parent_data.tags_data.append(tag_data)
 745
 746   def _get_cvs_branches(self):
 747     """Generate the CVSBranches present in this file."""
 748
 749     for branch_data in self.sdc.branches_data.values():
 750       yield CVSBranch(
 751           branch_data.id, self.cvs_file, branch_data.symbol,
 752           branch_data.branch_number,
 753           self.sdc.rev_to_lod(branch_data.parent),
 754           self._get_rev_id(branch_data.parent),
 755           self._get_rev_id(branch_data.child),
 756           None,
 757           )
 758
 759   def _get_cvs_tags(self):
 760     """Generate the CVSTags present in this file."""
 761
 762     for tags_data in self.sdc.tags_data.values():
 763       for tag_data in tags_data:
 764         yield CVSTag(
 765             tag_data.id, self.cvs_file, tag_data.symbol,
 766             self.sdc.rev_to_lod(tag_data.rev),
 767             self._get_rev_id(tag_data.rev),
 768             None,
 769             )
 770
 771   def set_description(self, description):
 772     """This is a callback method declared in Sink."""
 773
 774     self.cvs_file.description = description
 775     self.cvs_file.determine_file_properties(Ctx().file_property_setters)
 776
 777   def set_revision_info(self, revision, log, text):
 778     """This is a callback method declared in Sink."""
 779
 780     rev_data = self._rev_data[revision]
 781     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 782
 783     if cvs_rev.metadata_id is not None:
 784       # Users have reported problems with repositories in which the
 785       # deltatext block for revision 1.1 appears twice.  It is not
 786       # known whether this results from a CVS/RCS bug, or from botched
 787       # hand-editing of the repository.  In any case, empirically, cvs
 788       # and rcs both use the first version when checking out data, so
 789       # that's what we will do.  (For the record: "cvs log" fails on
 790       # such a file; "rlog" prints the log message from the first
 791       # block and ignores the second one.)
 792       Log().warn(
 793           "%s: in '%s':\n"
 794           "   Deltatext block for revision %s appeared twice;\n"
 795           "   ignoring the second occurrence.\n"
 796           % (warning_prefix, self.cvs_file.filename, revision,)
 797           )
 798       return
 799
 800     if is_trunk_revision(revision):
 801       branch_name = None
 802     else:
 803       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 804
 805     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 806         self.project, branch_name, rev_data.author, log
 807         )
 808     cvs_rev.deltatext_exists = bool(text)
 809
 810     # If this is revision 1.1, determine whether the file appears to
 811     # have been created via 'cvs add' instead of 'cvs import'.  The
 812     # test is that the log message CVS uses for 1.1 in imports is
 813     # "Initial revision\n" with no period.  (This fact helps determine
 814     # whether this file might have had a default branch in the past.)
 815     if revision == '1.1':
 816       self._file_imported = (log == 'Initial revision\n')
 817
 818   def parse_completed(self):
 819     """Finish the processing of this file.
 820
 821     This is a callback method declared in Sink."""
 822
 823     # Make sure that there was an info section for each revision:
 824     for cvs_item in self._cvs_file_items.values():
 825       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 826         self.collect_data.record_fatal_error(
 827             '%r has no deltatext section for revision %s'
 828             % (self.cvs_file.filename, cvs_item.rev,)
 829             )
 830
 831   def _determine_operation(self, rev_data):
 832     prev_rev_data = self._rev_data.get(rev_data.parent)
 833     return cvs_revision_type_map[(
 834         rev_data.state != 'dead',
 835         prev_rev_data is not None and prev_rev_data.state != 'dead',
 836         )]
 837
 838   def _get_cvs_revisions(self):
 839     """Generate the CVSRevisions present in this file."""
 840
 841     for rev_data in self._rev_data.itervalues():
 842       yield self._get_cvs_revision(rev_data)
 843
 844   def _get_cvs_revision(self, rev_data):
 845     """Create and return a CVSRevision for REV_DATA."""
 846
 847     branch_ids = [
 848         branch_data.id
 849         for branch_data in rev_data.branches_data
 850         ]
 851
 852     branch_commit_ids = [
 853         self._get_rev_id(rev)
 854         for rev in rev_data.branches_revs_data
 855         ]
 856
 857     tag_ids = [
 858         tag_data.id
 859         for tag_data in rev_data.tags_data
 860         ]
 861
 862     revision_type = self._determine_operation(rev_data)
 863
 864     return revision_type(
 865         self._get_rev_id(rev_data.rev), self.cvs_file,
 866         rev_data.timestamp, None,
 867         self._get_rev_id(rev_data.parent),
 868         self._get_rev_id(rev_data.child),
 869         rev_data.rev,
 870         True,
 871         self.sdc.rev_to_lod(rev_data.rev),
 872         rev_data.get_first_on_branch_id(),
 873         False, None, None,
 874         tag_ids, branch_ids, branch_commit_ids,
 875         rev_data.revision_reader_token
 876         )
 877
 878   def get_cvs_file_items(self):
 879     """Finish up and return a CVSFileItems instance for this file.
 880
 881     This method must only be called once."""
 882
 883     self._process_ntdbrs()
 884
 885     # Break a circular reference loop, allowing the memory for self
 886     # and sdc to be freed.
 887     del self.sdc
 888
 889     return self._cvs_file_items
 890
 891   def _process_ntdbrs(self):
 892     """Fix up any non-trunk default branch revisions (if present).
 893
 894     If a non-trunk default branch is determined to have existed, yield
 895     the _RevisionData.ids for all revisions that were once non-trunk
 896     default revisions, in dependency order.
 897
 898     There are two cases to handle:
 899
 900     One case is simple.  The RCS file lists a default branch
 901     explicitly in its header, such as '1.1.1'.  In this case, we know
 902     that every revision on the vendor branch is to be treated as head
 903     of trunk at that point in time.
 904
 905     But there's also a degenerate case.  The RCS file does not
 906     currently have a default branch, yet we can deduce that for some
 907     period in the past it probably *did* have one.  For example, the
 908     file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
 909     dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
 910     after 1.2.  In this case, we should record 1.1.1.96 as the last
 911     vendor revision to have been the head of the default branch.
 912
 913     If any non-trunk default branch revisions are found:
 914
 915     - Set their ntdbr members to True.
 916
 917     - Connect the last one with revision 1.2.
 918
 919     - Remove revision 1.1 if it is not needed.
 920
 921     """
 922
 923     try:
 924       if self.default_branch:
 925         vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
 926         vendor_lod_items = self._cvs_file_items.get_lod_items(
 927             self._cvs_file_items[vendor_cvs_branch_id]
 928             )
 929         if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
 930           return
 931       elif self._file_imported:
 932         vendor_branch_data = self.sdc.branches_data.get('1.1.1')
 933         if vendor_branch_data is None:
 934           return
 935         else:
 936           vendor_lod_items = self._cvs_file_items.get_lod_items(
 937               self._cvs_file_items[vendor_branch_data.id]
 938               )
 939           if not self._cvs_file_items.process_historical_ntdb(
 940                 vendor_lod_items
 941                 ):
 942             return
 943       else:
 944         return
 945     except VendorBranchError, e:
 946       self.collect_data.record_fatal_error(str(e))
 947       return
 948
 949     if self._file_imported:
 950       self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
 951
 952     self._cvs_file_items.check_link_consistency()
 953
 954
 955 class _ProjectDataCollector:
 956   def __init__(self, collect_data, project):
 957     self.collect_data = collect_data
 958     self.project = project
 959     self.num_files = 0
 960
 961     # The Trunk LineOfDevelopment object for this project:
 962     self.trunk = Trunk(
 963         self.collect_data.symbol_key_generator.gen_id(), self.project
 964         )
 965     self.project.trunk_id = self.trunk.id
 966
 967     # This causes a record for self.trunk to spring into existence:
 968     self.collect_data.register_trunk(self.trunk)
 969
 970     # A map { name -> Symbol } for all known symbols in this project.
 971     # The symbols listed here are undifferentiated into Branches and
 972     # Tags because the same name might appear as a branch in one file
 973     # and a tag in another.
 974     self.symbols = {}
 975
 976     # A map { (old_name, new_name) : count } indicating how many files
 977     # were affected by each each symbol name transformation:
 978     self.symbol_transform_counts = {}
 979
 980   def get_symbol(self, name):
 981     """Return the Symbol object for the symbol named NAME in this project.
 982
 983     If such a symbol does not yet exist, allocate a new symbol_id,
 984     create a Symbol instance, store it in self.symbols, and return it."""
 985
 986     symbol = self.symbols.get(name)
 987     if symbol is None:
 988       symbol = Symbol(
 989           self.collect_data.symbol_key_generator.gen_id(),
 990           self.project, name)
 991       self.symbols[name] = symbol
 992     return symbol
 993
 994   def log_symbol_transform(self, old_name, new_name):
 995     """Record that OLD_NAME was transformed to NEW_NAME in one file.
 996
 997     This information is used to generated a statistical summary of
 998     symbol transforms."""
 999
1000     try:
1001       self.symbol_transform_counts[old_name, new_name] += 1
1002     except KeyError:
1003       self.symbol_transform_counts[old_name, new_name] = 1
1004
1005   def summarize_symbol_transforms(self):
1006     if self.symbol_transform_counts and Log().is_on(Log.NORMAL):
1007       log = Log()
1008       log.normal('Summary of symbol transforms:')
1009       transforms = self.symbol_transform_counts.items()
1010       transforms.sort()
1011       for ((old_name, new_name), count) in transforms:
1012         if new_name is None:
1013           log.normal('    "%s" ignored in %d files' % (old_name, count,))
1014         else:
1015           log.normal(
1016               '    "%s" transformed to "%s" in %d files'
1017               % (old_name, new_name, count,)
1018               )
1019
1020   def process_file(self, cvs_file):
1021     Log().normal(cvs_file.filename)
1022     fdc = _FileDataCollector(self, cvs_file)
1023     try:
1024       cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
1025     except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
1026       self.collect_data.record_fatal_error(
1027           "%r is not a valid ,v file" % (cvs_file.filename,)
1028           )
1029       # Abort the processing of this file, but let the pass continue
1030       # with other files:
1031       return
1032     except:
1033       Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
1034       raise
1035     else:
1036       self.num_files += 1
1037
1038     return fdc.get_cvs_file_items()
1039
1040
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, stats_keeper):
    """Create the stores and key generators used during collection.

    STATS_KEEPER is the object to which each processed CVSFile and
    CVSItem is reported (see add_cvs_file_items())."""

    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # The fatal errors recorded so far (a list of strings):
    self.fatal_errors = []

    # The total number of files processed, over all projects:
    self.num_files = 0

    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSFiles:
    self.file_key_generator = KeyGenerator()

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    Log().error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # This causes a record to spring into existence:
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile.

    Clean up CVS_FILE_ITEMS, check its consistency, then store it and
    register it with the symbol statistics."""

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def process_project(self, project):
    """Collect the data for all of the paths in PROJECT.

    Directories yielded by walk_repository() are recorded; every
    other path is parsed as an RCS file and its items are processed.
    A file that cannot be parsed is skipped (the fatal error has
    already been recorded by then).  If no RCS file at all is found,
    a fatal error is recorded."""

    Ctx()._projects[project.id] = project

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in walk_repository(
          project, self.file_key_generator, self.record_fatal_error
          ):
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
        continue

      found_rcs_file = True
      cvs_file_items = pdc.process_file(cvs_path)
      if cvs_file_items is None:
        # process_file() returns None for a file that is not a valid
        # ,v file.  There is nothing to store for such a file, so
        # continue with the other files:
        continue
      self._process_cvs_file_items(cvs_file_items)

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    Log().verbose('Processed', self.num_files, 'files')

  def _register_empty_subdirectories(self):
    """Set the CVSDirectory.empty_subdirectory_id members."""

    # Start with all directories, then strike out every directory
    # that contains a file, directly or indirectly:
    directories = set(
        path
        for path in Ctx()._cvs_path_db.itervalues()
        if isinstance(path, CVSDirectory)
        )
    for path in Ctx()._cvs_path_db.itervalues():
      if isinstance(path, CVSFile):
        directory = path.parent_directory
        # Walk up the ancestors of the file, stopping at the first one
        # that has already been struck out:
        while directory is not None and directory in directories:
          directories.remove(directory)
          directory = directory.parent_directory

    # The directories that remain contain no files.  Tell their
    # parents about them:
    for directory in directories:
      if directory.parent_directory is not None:
        directory.parent_directory.empty_subdirectory_ids.append(directory.id)

  def _set_cvs_path_ordinals(self):
    """Set the ordinal member of every path in the CVS path database.

    The ordinal of a path is its position when all of the paths are
    sorted using CVSPath.slow_compare()."""

    cvs_paths = list(Ctx()._cvs_path_db.itervalues())
    cvs_paths.sort(CVSPath.slow_compare)
    for (i, cvs_path) in enumerate(cvs_paths):
      cvs_path.ordinal = i

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._register_empty_subdirectories()
    self._set_cvs_path_ordinals()
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
1199
1200