cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents (though even the file contents
  22 might be recorded by the RevisionRecorder if one is configured).
  23
  24 As a *,v file is parsed, the information pertaining to the file is
  25 accumulated in memory, mostly in _RevisionData, _BranchData, and
  26 _TagData objects.  When parsing is complete, a final pass is made over
  27 the data to create some final dependency links, collect statistics,
  28 etc., then the _*Data objects are converted into CVSItem objects
  29 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  30 dumped into databases.
  31
  32 During the data collection, persistent unique ids are allocated to
  33 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  34 special case.  CVSItem ids are unique across all CVSItem types, and
  35 the ids are carried over from the corresponding data collection
  36 objects:
  37
  38     _RevisionData -> CVSRevision
  39
  40     _BranchData -> CVSBranch
  41
  42     _TagData -> CVSTag
  43
  44 In a later pass it is possible to convert tags <-> branches.  But even
  45 if this occurs, the new branch or tag uses the same id as the old tag
  46 or branch.
  47
  48 """
  49
  50
  51 import os
  52 import stat
  53 import re
  54
  55 from cvs2svn_lib import config
  56 from cvs2svn_lib.common import DB_OPEN_NEW
  57 from cvs2svn_lib.common import warning_prefix
  58 from cvs2svn_lib.common import error_prefix
  59 from cvs2svn_lib.common import is_trunk_revision
  60 from cvs2svn_lib.log import Log
  61 from cvs2svn_lib.context import Ctx
  62 from cvs2svn_lib.artifact_manager import artifact_manager
  63 from cvs2svn_lib.cvs_path import CVSPath
  64 from cvs2svn_lib.cvs_path import CVSFile
  65 from cvs2svn_lib.cvs_path import CVSDirectory
  66 from cvs2svn_lib.symbol import Symbol
  67 from cvs2svn_lib.symbol import Trunk
  68 from cvs2svn_lib.cvs_item import CVSRevision
  69 from cvs2svn_lib.cvs_item import CVSBranch
  70 from cvs2svn_lib.cvs_item import CVSTag
  71 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  72 from cvs2svn_lib.cvs_file_items import VendorBranchError
  73 from cvs2svn_lib.cvs_file_items import CVSFileItems
  74 from cvs2svn_lib.key_generator import KeyGenerator
  75 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  76 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  77 from cvs2svn_lib.metadata_database import MetadataDatabase
  78 from cvs2svn_lib.metadata_database import MetadataLogger
  79 from cvs2svn_lib.repository_walker import walk_repository
  80
  81 import cvs2svn_rcsparse
  82
  83
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  It accepts two or
# more groups of digits separated by single dots, for example '1.2'
# or '1.7.2'.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching branch-style revision numbers: an even
# number (at least two) of digit groups, optionally followed by CVS's
# extra '0' group, followed by one final digit group.  Group 1 is the
# dotted prefix (including its trailing dot) and group 2 is the final
# digit group, so substituting r'\1\2' removes the extra '0.' (for
# example, '1.7.0.2' becomes '1.7.2').
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
 100
 101
 102 def is_branch_revision_number(rev):
 103   """Return True iff REV is a branch revision number.
 104
 105   REV is a CVS revision number in canonical form (i.e., with zeros
 106   removed).  Return True iff it refers to a whole branch, as opposed
 107   to a single revision."""
 108
 109   return rev.count('.') % 2 == 0
 110
 111
 112 def is_same_line_of_development(rev1, rev2):
 113   """Return True if rev1 and rev2 are on the same line of
 114   development (i.e., both on trunk, or both on the same branch);
 115   return False otherwise.  Either rev1 or rev2 can be None, in
 116   which case automatically return False."""
 117
 118   if rev1 is None or rev2 is None:
 119     return False
 120   if rev1.count('.') == 1 and rev2.count('.') == 1:
 121     return True
 122   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 123     return True
 124   return False
 125
 126
 127 class _RevisionData:
 128   """We track the state of each revision so that in set_revision_info,
 129   we can determine if our op is an add/change/delete.  We can do this
 130   because in set_revision_info, we'll have all of the _RevisionData
 131   for a file at our fingertips, and we need to examine the state of
 132   our prev_rev to determine if we're an add or a change.  Without the
 133   state of the prev_rev, we are unable to distinguish between an add
 134   and a change."""
 135
 136   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 137     # The id of this revision:
 138     self.cvs_rev_id = cvs_rev_id
 139     self.rev = rev
 140     self.timestamp = timestamp
 141     self.author = author
 142     self.original_timestamp = timestamp
 143     self.state = state
 144
 145     # If this is the first revision on a branch, then this is the
 146     # branch_data of that branch; otherwise it is None.
 147     self.parent_branch_data = None
 148
 149     # The revision number of the parent of this revision along the
 150     # same line of development, if any.  For the first revision R on a
 151     # branch, we consider the revision from which R sprouted to be the
 152     # 'parent'.  If this is the root revision in the file's revision
 153     # tree, then this field is None.
 154     #
 155     # Note that this revision can't be determined arithmetically (due
 156     # to cvsadmin -o), which is why this field is necessary.
 157     self.parent = None
 158
 159     # The revision number of the primary child of this revision (the
 160     # child along the same line of development), if any; otherwise,
 161     # None.
 162     self.child = None
 163
 164     # The _BranchData instances of branches that sprout from this
 165     # revision, sorted in ascending order by branch number.  It would
 166     # be inconvenient to initialize it here because we would have to
 167     # scan through all branches known by the _SymbolDataCollector to
 168     # find the ones having us as the parent.  Instead, this
 169     # information is filled in by
 170     # _FileDataCollector._resolve_dependencies() and sorted by
 171     # _FileDataCollector._sort_branches().
 172     self.branches_data = []
 173
 174     # The revision numbers of the first commits on any branches on
 175     # which commits occurred.  This dependency is kept explicitly
 176     # because otherwise a revision-only topological sort would miss
 177     # the dependency that exists via branches_data.
 178     self.branches_revs_data = []
 179
 180     # The _TagData instances of tags that are connected to this
 181     # revision.
 182     self.tags_data = []
 183
 184     # A token that may be returned from
 185     # RevisionRecorder.record_text().  It can be used by
 186     # RevisionReader to obtain the text again.
 187     self.revision_recorder_token = None
 188
 189   def get_first_on_branch_id(self):
 190     return self.parent_branch_data and self.parent_branch_data.id
 191
 192
 193 class _SymbolData:
 194   """Collection area for information about a symbol in a single CVSFile.
 195
 196   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 197   Tag regardless of whether self is a _BranchData or a _TagData."""
 198
 199   def __init__(self, id, symbol):
 200     """Initialize an object for SYMBOL."""
 201
 202     # The unique id that will be used for this particular symbol in
 203     # this particular file.  This same id will be used for the CVSItem
 204     # that is derived from this instance.
 205     self.id = id
 206
 207     # An instance of Symbol.
 208     self.symbol = symbol
 209
 210
 211 class _BranchData(_SymbolData):
 212   """Collection area for information about a Branch in a single CVSFile."""
 213
 214   def __init__(self, id, symbol, branch_number):
 215     _SymbolData.__init__(self, id, symbol)
 216
 217     # The branch number (e.g., '1.5.2') of this branch.
 218     self.branch_number = branch_number
 219
 220     # The revision number of the revision from which this branch
 221     # sprouts (e.g., '1.5').
 222     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 223
 224     # The revision number of the first commit on this branch, if any
 225     # (e.g., '1.5.2.1'); otherwise, None.
 226     self.child = None
 227
 228
 229 class _TagData(_SymbolData):
 230   """Collection area for information about a Tag in a single CVSFile."""
 231
 232   def __init__(self, id, symbol, rev):
 233     _SymbolData.__init__(self, id, symbol)
 234
 235     # The revision number being tagged (e.g., '1.5.2.3').
 236     self.rev = rev
 237
 238
 239 class _SymbolDataCollector(object):
 240   """Collect information about symbols in a single CVSFile."""
 241
 242   def __init__(self, fdc, cvs_file):
 243     self.fdc = fdc
 244     self.cvs_file = cvs_file
 245
 246     self.pdc = self.fdc.pdc
 247     self.collect_data = self.fdc.collect_data
 248
 249     # A list [(name, revision), ...] of symbols defined in the header
 250     # of the file.  The name has already been transformed using the
 251     # symbol transform rules.  If the symbol transform rules indicate
 252     # that the symbol should be ignored, then it is never added to
 253     # this list.  This list is processed then deleted in
 254     # process_symbols().
 255     self._symbol_defs = []
 256
 257     # A set containing the transformed names of symbols in this file
 258     # (used to detect duplicates during processing of unlabeled
 259     # branches):
 260     self._defined_symbols = set()
 261
 262     # Map { branch_number : _BranchData }, where branch_number has an
 263     # odd number of digits.
 264     self.branches_data = { }
 265
 266     # Map { revision : [ tag_data ] }, where revision has an even
 267     # number of digits, and the value is a list of _TagData objects
 268     # for tags that apply to that revision.
 269     self.tags_data = { }
 270
 271   def _add_branch(self, name, branch_number):
 272     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 273     and derive and record the revision from which NAME sprouts.
 274     BRANCH_NUMBER is an RCS branch number with an odd number of
 275     components, for example '1.7.2' (never '1.7.0.2').  Return the
 276     _BranchData instance (which is usually newly-created)."""
 277
 278     branch_data = self.branches_data.get(branch_number)
 279
 280     if branch_data is not None:
 281       Log().warn(
 282           "%s: in '%s':\n"
 283           "   branch '%s' already has name '%s',\n"
 284           "   cannot also have name '%s', ignoring the latter\n"
 285           % (warning_prefix,
 286              self.cvs_file.filename, branch_number,
 287              branch_data.symbol.name, name)
 288           )
 289       return branch_data
 290
 291     symbol = self.pdc.get_symbol(name)
 292     branch_data = _BranchData(
 293         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 294         )
 295     self.branches_data[branch_number] = branch_data
 296     return branch_data
 297
 298   def _construct_distinct_name(self, name, original_name):
 299     """Construct a distinct symbol name from NAME.
 300
 301     If NAME is distinct, return it.  If it is already used in this
 302     file (as determined from its presence in self._defined_symbols),
 303     construct and return a new name that is not already used."""
 304
 305     if name not in self._defined_symbols:
 306       return name
 307     else:
 308       index = 1
 309       while True:
 310         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 311         if dup_name not in self._defined_symbols:
 312           self.collect_data.record_fatal_error(
 313               "Symbol name '%s' is already used in '%s'.\n"
 314               "The unlabeled branch '%s' must be renamed using "
 315               "--symbol-transform."
 316               % (name, self.cvs_file.filename, original_name,)
 317               )
 318           return dup_name
 319
 320   def _add_unlabeled_branch(self, branch_number):
 321     original_name = "unlabeled-" + branch_number
 322     name = self.transform_symbol(original_name, branch_number)
 323     if name is None:
 324       self.collect_data.record_fatal_error(
 325           "The unlabeled branch '%s' in '%s' contains commits.\n"
 326           "It may not be ignored via a symbol transform.  (Use --exclude "
 327           "instead.)"
 328           % (original_name, self.cvs_file.filename,)
 329           )
 330       # Retain the original name to allow the conversion to continue:
 331       name = original_name
 332
 333     distinct_name = self._construct_distinct_name(name, original_name)
 334     self._defined_symbols.add(distinct_name)
 335     return self._add_branch(distinct_name, branch_number)
 336
 337   def _add_tag(self, name, revision):
 338     """Record that tag NAME refers to the specified REVISION."""
 339
 340     symbol = self.pdc.get_symbol(name)
 341     tag_data = _TagData(
 342         self.collect_data.item_key_generator.gen_id(), symbol, revision
 343         )
 344     self.tags_data.setdefault(revision, []).append(tag_data)
 345     return tag_data
 346
 347   def transform_symbol(self, name, revision):
 348     """Transform a symbol according to the project's symbol transforms.
 349
 350     Transform the symbol with the original name NAME and canonicalized
 351     revision number REVISION.  Return the new symbol name or None if
 352     the symbol should be ignored entirely.
 353
 354     Log the results of the symbol transform if necessary."""
 355
 356     old_name = name
 357     # Apply any user-defined symbol transforms to the symbol name:
 358     name = self.cvs_file.project.transform_symbol(
 359         self.cvs_file, name, revision
 360         )
 361
 362     if name is None:
 363       # Ignore symbol:
 364       self.pdc.log_symbol_transform(old_name, None)
 365       Log().verbose(
 366           "   symbol '%s'=%s ignored in %s"
 367           % (old_name, revision, self.cvs_file.filename,)
 368           )
 369     else:
 370       if name != old_name:
 371         self.pdc.log_symbol_transform(old_name, name)
 372         Log().verbose(
 373             "   symbol '%s'=%s transformed to '%s' in %s"
 374             % (old_name, revision, name, self.cvs_file.filename,)
 375             )
 376
 377     return name
 378
 379   def define_symbol(self, name, revision):
 380     """Record a symbol definition for later processing."""
 381
 382     # Canonicalize the revision number:
 383     revision = _branch_revision_re.sub(r'\1\2', revision)
 384
 385     # Apply any user-defined symbol transforms to the symbol name:
 386     name = self.transform_symbol(name, revision)
 387
 388     if name is not None:
 389       # Verify that the revision number is valid:
 390       if _valid_revision_re.match(revision):
 391         # The revision number is valid; record it for later processing:
 392         self._symbol_defs.append( (name, revision) )
 393       else:
 394         Log().warn(
 395             'In %r:\n'
 396             '    branch %r references invalid revision %s\n'
 397             '    and will be ignored.'
 398             % (self.cvs_file.filename, name, revision,)
 399             )
 400
 401   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 402     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 403
 404     Duplicate definitions of symbol names have been seen in the wild,
 405     and they can also happen when --symbol-transform is used.  If a
 406     symbol is defined to the same revision number repeatedly, then
 407     ignore all but the last definition."""
 408
 409     # Make a copy, since we have to iterate through the definitions
 410     # twice:
 411     symbol_defs = list(symbol_defs)
 412
 413     # A map { (name, revision) : [index,...] } of the indexes where
 414     # symbol definitions name=revision were found:
 415     known_definitions = {}
 416     for (i, symbol_def) in enumerate(symbol_defs):
 417       known_definitions.setdefault(symbol_def, []).append(i)
 418
 419     # A set of the indexes of entries that have to be removed from
 420     # symbol_defs:
 421     dup_indexes = set()
 422     for ((name, revision), indexes) in known_definitions.iteritems():
 423       if len(indexes) > 1:
 424         Log().verbose(
 425             "in %r:\n"
 426             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 427             % (self.cvs_file.filename, name, revision,)
 428             )
 429         dup_indexes.update(indexes[:-1])
 430
 431     for (i, symbol_def) in enumerate(symbol_defs):
 432       if i not in dup_indexes:
 433         yield symbol_def
 434
 435   def _process_duplicate_defs(self, symbol_defs):
 436     """Iterate through SYMBOL_DEFS, processing duplicate names.
 437
 438     Duplicate definitions of symbol names have been seen in the wild,
 439     and they can also happen when --symbol-transform is used.  If a
 440     symbol is defined multiple times, then it is a fatal error.  This
 441     method should be called after _eliminate_trivial_duplicate_defs()."""
 442
 443     # Make a copy, since we have to access multiple times:
 444     symbol_defs = list(symbol_defs)
 445
 446     # A map {name : [index,...]} mapping the names of symbols to a
 447     # list of their definitions' indexes in symbol_defs:
 448     known_symbols = {}
 449     for (i, (name, revision)) in enumerate(symbol_defs):
 450       known_symbols.setdefault(name, []).append(i)
 451
 452     known_symbols = known_symbols.items()
 453     known_symbols.sort()
 454     dup_indexes = set()
 455     for (name, indexes) in known_symbols:
 456       if len(indexes) > 1:
 457         # This symbol was defined multiple times.
 458         self.collect_data.record_fatal_error(
 459             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 460                 name, self.cvs_file.filename,
 461                 ' '.join([symbol_defs[i][1] for i in indexes]),
 462                 )
 463             )
 464         # Ignore all but the last definition for now, to allow the
 465         # conversion to proceed:
 466         dup_indexes.update(indexes[:-1])
 467
 468     for (i, symbol_def) in enumerate(symbol_defs):
 469       if i not in dup_indexes:
 470         yield symbol_def
 471
 472   def _process_symbol(self, name, revision):
 473     """Process a symbol called NAME, which is associated with REVISON.
 474
 475     REVISION is a canonical revision number with zeros removed, for
 476     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 477     transformed branch or tag name."""
 478
 479     # Add symbol to our records:
 480     if is_branch_revision_number(revision):
 481       self._add_branch(name, revision)
 482     else:
 483       self._add_tag(name, revision)
 484
 485   def process_symbols(self):
 486     """Process the symbol definitions from SELF._symbol_defs."""
 487
 488     symbol_defs = self._symbol_defs
 489     del self._symbol_defs
 490
 491     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 492     symbol_defs = self._process_duplicate_defs(symbol_defs)
 493
 494     for (name, revision) in symbol_defs:
 495       self._defined_symbols.add(name)
 496       self._process_symbol(name, revision)
 497
 498   @staticmethod
 499   def rev_to_branch_number(revision):
 500     """Return the branch_number of the branch on which REVISION lies.
 501
 502     REVISION is a branch revision number with an even number of
 503     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 504     The return value is the branch number (for example, '1.7.2').
 505     Return none iff REVISION is a trunk revision such as '1.2'."""
 506
 507     if is_trunk_revision(revision):
 508       return None
 509     return revision[:revision.rindex(".")]
 510
 511   def rev_to_branch_data(self, revision):
 512     """Return the branch_data of the branch on which REVISION lies.
 513
 514     REVISION must be a branch revision number with an even number of
 515     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 516     Raise KeyError iff REVISION is unknown."""
 517
 518     assert not is_trunk_revision(revision)
 519
 520     return self.branches_data[self.rev_to_branch_number(revision)]
 521
 522   def rev_to_lod(self, revision):
 523     """Return the line of development on which REVISION lies.
 524
 525     REVISION must be a revision number with an even number of
 526     components.  Raise KeyError iff REVISION is unknown."""
 527
 528     if is_trunk_revision(revision):
 529       return self.pdc.trunk
 530     else:
 531       return self.rev_to_branch_data(revision).symbol
 532
 533
 534 class _FileDataCollector(cvs2svn_rcsparse.Sink):
 535   """Class responsible for collecting RCS data for a particular file.
 536
 537   Any collected data that need to be remembered are stored into the
 538   referenced CollectData instance."""
 539
 540   def __init__(self, pdc, cvs_file):
 541     """Create an object that is prepared to receive data for CVS_FILE.
 542     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 543     information collected about the file."""
 544
 545     self.pdc = pdc
 546     self.cvs_file = cvs_file
 547
 548     self.collect_data = self.pdc.collect_data
 549     self.project = self.cvs_file.project
 550
 551     # A place to store information about the symbols in this file:
 552     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 553
 554     # { revision : _RevisionData instance }
 555     self._rev_data = { }
 556
 557     # Lists [ (parent, child) ] of revision number pairs indicating
 558     # that revision child depends on revision parent along the main
 559     # line of development.
 560     self._primary_dependencies = []
 561
 562     # If set, this is an RCS branch number -- rcsparse calls this the
 563     # "principal branch", but CVS and RCS refer to it as the "default
 564     # branch", so that's what we call it, even though the rcsparse API
 565     # setter method is still 'set_principal_branch'.
 566     self.default_branch = None
 567
 568     # True iff revision 1.1 of the file appears to have been imported
 569     # (as opposed to added normally).
 570     self._file_imported = False
 571
 572   def _get_rev_id(self, revision):
 573     if revision is None:
 574       return None
 575     return self._rev_data[revision].cvs_rev_id
 576
 577   def set_principal_branch(self, branch):
 578     """This is a callback method declared in Sink."""
 579
 580     if branch.find('.') == -1:
 581       # This just sets the default branch to trunk.  Normally this
 582       # shouldn't occur, but it has been seen in at least one CVS
 583       # repository.  Just ignore it.
 584       pass
 585     else:
 586       self.default_branch = branch
 587
  def set_expansion(self, mode):
    """Store MODE as the 'mode' attribute of this file's CVSFile.

    This is a callback method declared in Sink.  MODE is presumably
    the keyword-expansion setting from the RCS file header (TODO
    confirm against the Sink interface)."""

    self.cvs_file.mode = mode
 592
  def set_description(self, description):
    """Store DESCRIPTION as the 'description' attribute of the CVSFile.

    This is a callback method declared in Sink."""

    self.cvs_file.description = description
 597
  def define_tag(self, name, revision):
    """Remember the symbol name and revision, but don't process them yet.

    The definition is handed to the _SymbolDataCollector, which
    queues it until admin_completed() is called.

    This is a callback method declared in Sink."""

    self.sdc.define_symbol(name, revision)
 604
  def admin_completed(self):
    """Process the symbol definitions that define_tag() has queued.

    This is a callback method declared in Sink."""

    self.sdc.process_symbols()
 609
 610   def define_revision(self, revision, timestamp, author, state,
 611                       branches, next):
 612     """This is a callback method declared in Sink."""
 613
 614     for branch in branches:
 615       try:
 616         branch_data = self.sdc.rev_to_branch_data(branch)
 617       except KeyError:
 618         # Normally we learn about the branches from the branch names
 619         # and numbers parsed from the symbolic name header.  But this
 620         # must have been an unlabeled branch that slipped through the
 621         # net.  Generate a name for it and create a _BranchData record
 622         # for it now.
 623         branch_data = self.sdc._add_unlabeled_branch(
 624             self.sdc.rev_to_branch_number(branch))
 625
 626       assert branch_data.child is None
 627       branch_data.child = branch
 628
 629     if revision in self._rev_data:
 630       # This revision has already been seen.
 631       Log().error('File %r contains duplicate definitions of revision %s.'
 632                   % (self.cvs_file.filename, revision,))
 633       raise RuntimeError
 634
 635     # Record basic information about the revision:
 636     rev_data = _RevisionData(
 637         self.collect_data.item_key_generator.gen_id(),
 638         revision, int(timestamp), author, state)
 639     self._rev_data[revision] = rev_data
 640
 641     # When on trunk, the RCS 'next' revision number points to what
 642     # humans might consider to be the 'previous' revision number.  For
 643     # example, 1.3's RCS 'next' is 1.2.
 644     #
 645     # However, on a branch, the RCS 'next' revision number really does
 646     # point to what humans would consider to be the 'next' revision
 647     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 648     #
 649     # In other words, in RCS, 'next' always means "where to find the next
 650     # deltatext that you need this revision to retrieve.
 651     #
 652     # That said, we don't *want* RCS's behavior here, so we determine
 653     # whether we're on trunk or a branch and set the dependencies
 654     # accordingly.
 655     if next:
 656       if is_trunk_revision(revision):
 657         self._primary_dependencies.append( (next, revision,) )
 658       else:
 659         self._primary_dependencies.append( (revision, next,) )
 660
 661   def _resolve_primary_dependencies(self):
 662     """Resolve the dependencies listed in self._primary_dependencies."""
 663
 664     for (parent, child,) in self._primary_dependencies:
 665       parent_data = self._rev_data[parent]
 666       assert parent_data.child is None
 667       parent_data.child = child
 668
 669       child_data = self._rev_data[child]
 670       assert child_data.parent is None
 671       child_data.parent = parent
 672
 673   def _resolve_branch_dependencies(self):
 674     """Resolve dependencies involving branches."""
 675
 676     for branch_data in self.sdc.branches_data.values():
 677       # The branch_data's parent has the branch as a child regardless
 678       # of whether the branch had any subsequent commits:
 679       try:
 680         parent_data = self._rev_data[branch_data.parent]
 681       except KeyError:
 682         Log().warn(
 683             'In %r:\n'
 684             '    branch %r references non-existing revision %s\n'
 685             '    and will be ignored.'
 686             % (self.cvs_file.filename, branch_data.symbol.name,
 687                branch_data.parent,))
 688         del self.sdc.branches_data[branch_data.branch_number]
 689       else:
 690         parent_data.branches_data.append(branch_data)
 691
 692         # If the branch has a child (i.e., something was committed on
 693         # the branch), then we store a reference to the branch_data
 694         # there, define the child's parent to be the branch's parent,
 695         # and list the child in the branch parent's branches_revs_data:
 696         if branch_data.child is not None:
 697           child_data = self._rev_data[branch_data.child]
 698           assert child_data.parent_branch_data is None
 699           child_data.parent_branch_data = branch_data
 700           assert child_data.parent is None
 701           child_data.parent = branch_data.parent
 702           parent_data.branches_revs_data.append(branch_data.child)
 703
 704   def _sort_branches(self):
 705     """Sort the branches sprouting from each revision in creation order.
 706
 707     Creation order is taken to be the reverse of the order that they
 708     are listed in the symbols part of the RCS file.  (If a branch is
 709     created then deleted, a later branch can be assigned the recycled
 710     branch number; therefore branch numbers are not an indication of
 711     creation order.)"""
 712
 713     for rev_data in self._rev_data.values():
 714       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 715
 716   def _resolve_tag_dependencies(self):
 717     """Resolve dependencies involving tags."""
 718
 719     for (rev, tag_data_list) in self.sdc.tags_data.items():
 720       try:
 721         parent_data = self._rev_data[rev]
 722       except KeyError:
 723         Log().warn(
 724             'In %r:\n'
 725             '    the following tag(s) reference non-existing revision %s\n'
 726             '    and will be ignored:\n'
 727             '    %s' % (
 728                 self.cvs_file.filename, rev,
 729                 ', '.join([repr(tag_data.symbol.name)
 730                            for tag_data in tag_data_list]),))
 731         del self.sdc.tags_data[rev]
 732       else:
 733         for tag_data in tag_data_list:
 734           assert tag_data.rev == rev
 735           # The tag_data's rev has the tag as a child:
 736           parent_data.tags_data.append(tag_data)
 737
 738   def _determine_operation(self, rev_data):
 739     prev_rev_data = self._rev_data.get(rev_data.parent)
 740     return cvs_revision_type_map[(
 741         rev_data.state != 'dead',
 742         prev_rev_data is not None and prev_rev_data.state != 'dead',
 743         )]
 744
 745   def _get_cvs_revision(self, rev_data):
 746     """Create and return a CVSRevision for REV_DATA."""
 747
 748     branch_ids = [
 749         branch_data.id
 750         for branch_data in rev_data.branches_data
 751         ]
 752
 753     branch_commit_ids = [
 754         self._get_rev_id(rev)
 755         for rev in rev_data.branches_revs_data
 756         ]
 757
 758     tag_ids = [
 759         tag_data.id
 760         for tag_data in rev_data.tags_data
 761         ]
 762
 763     revision_type = self._determine_operation(rev_data)
 764
 765     return revision_type(
 766         self._get_rev_id(rev_data.rev), self.cvs_file,
 767         rev_data.timestamp, None,
 768         self._get_rev_id(rev_data.parent),
 769         self._get_rev_id(rev_data.child),
 770         rev_data.rev,
 771         True,
 772         self.sdc.rev_to_lod(rev_data.rev),
 773         rev_data.get_first_on_branch_id(),
 774         False, None, None,
 775         tag_ids, branch_ids, branch_commit_ids,
 776         rev_data.revision_recorder_token)
 777
 778   def _get_cvs_revisions(self):
 779     """Generate the CVSRevisions present in this file."""
 780
 781     for rev_data in self._rev_data.itervalues():
 782       yield self._get_cvs_revision(rev_data)
 783
 784   def _get_cvs_branches(self):
 785     """Generate the CVSBranches present in this file."""
 786
 787     for branch_data in self.sdc.branches_data.values():
 788       yield CVSBranch(
 789           branch_data.id, self.cvs_file, branch_data.symbol,
 790           branch_data.branch_number,
 791           self.sdc.rev_to_lod(branch_data.parent),
 792           self._get_rev_id(branch_data.parent),
 793           self._get_rev_id(branch_data.child),
 794           None,
 795           )
 796
 797   def _get_cvs_tags(self):
 798     """Generate the CVSTags present in this file."""
 799
 800     for tags_data in self.sdc.tags_data.values():
 801       for tag_data in tags_data:
 802         yield CVSTag(
 803             tag_data.id, self.cvs_file, tag_data.symbol,
 804             self.sdc.rev_to_lod(tag_data.rev),
 805             self._get_rev_id(tag_data.rev),
 806             None,
 807             )
 808
 809   def tree_completed(self):
 810     """The revision tree has been parsed.
 811
 812     Analyze it for consistency and connect some loose ends.
 813
 814     This is a callback method declared in Sink."""
 815
 816     self._resolve_primary_dependencies()
 817     self._resolve_branch_dependencies()
 818     self._sort_branches()
 819     self._resolve_tag_dependencies()
 820
 821     # Compute the preliminary CVSFileItems for this file:
 822     cvs_items = []
 823     cvs_items.extend(self._get_cvs_revisions())
 824     cvs_items.extend(self._get_cvs_branches())
 825     cvs_items.extend(self._get_cvs_tags())
 826     self._cvs_file_items = CVSFileItems(
 827         self.cvs_file, self.pdc.trunk, cvs_items
 828         )
 829
 830     self._cvs_file_items.check_link_consistency()
 831
 832     # Warm up the revision recorder:
 833     self.collect_data.revision_recorder.start_file(self.cvs_file)
 834
 835   def set_revision_info(self, revision, log, text):
 836     """This is a callback method declared in Sink."""
 837
 838     rev_data = self._rev_data[revision]
 839     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 840
 841     if cvs_rev.metadata_id is not None:
 842       # Users have reported problems with repositories in which the
 843       # deltatext block for revision 1.1 appears twice.  It is not
 844       # known whether this results from a CVS/RCS bug, or from botched
 845       # hand-editing of the repository.  In any case, empirically, cvs
 846       # and rcs both use the first version when checking out data, so
 847       # that's what we will do.  (For the record: "cvs log" fails on
 848       # such a file; "rlog" prints the log message from the first
 849       # block and ignores the second one.)
 850       Log().warn(
 851           "%s: in '%s':\n"
 852           "   Deltatext block for revision %s appeared twice;\n"
 853           "   ignoring the second occurrence.\n"
 854           % (warning_prefix, self.cvs_file.filename, revision,)
 855           )
 856       return
 857
 858     if is_trunk_revision(revision):
 859       branch_name = None
 860     else:
 861       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 862
 863     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 864         self.project, branch_name, rev_data.author, log
 865         )
 866     cvs_rev.deltatext_exists = bool(text)
 867
 868     # If this is revision 1.1, determine whether the file appears to
 869     # have been created via 'cvs add' instead of 'cvs import'.  The
 870     # test is that the log message CVS uses for 1.1 in imports is
 871     # "Initial revision\n" with no period.  (This fact helps determine
 872     # whether this file might have had a default branch in the past.)
 873     if revision == '1.1':
 874       self._file_imported = (log == 'Initial revision\n')
 875
 876     cvs_rev.revision_recorder_token = \
 877         self.collect_data.revision_recorder.record_text(cvs_rev, log, text)
 878
 879   def parse_completed(self):
 880     """Finish the processing of this file.
 881
 882     This is a callback method declared in Sink."""
 883
 884     # Make sure that there was an info section for each revision:
 885     for cvs_item in self._cvs_file_items.values():
 886       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 887         self.collect_data.record_fatal_error(
 888             '%r has no deltatext section for revision %s'
 889             % (self.cvs_file.filename, cvs_item.rev,)
 890             )
 891
 892   def _process_ntdbrs(self):
 893     """Fix up any non-trunk default branch revisions (if present).
 894
 895     If a non-trunk default branch is determined to have existed, yield
 896     the _RevisionData.ids for all revisions that were once non-trunk
 897     default revisions, in dependency order.
 898
 899     There are two cases to handle:
 900
 901     One case is simple.  The RCS file lists a default branch
 902     explicitly in its header, such as '1.1.1'.  In this case, we know
 903     that every revision on the vendor branch is to be treated as head
 904     of trunk at that point in time.
 905
 906     But there's also a degenerate case.  The RCS file does not
 907     currently have a default branch, yet we can deduce that for some
 908     period in the past it probably *did* have one.  For example, the
 909     file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
 910     dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
 911     after 1.2.  In this case, we should record 1.1.1.96 as the last
 912     vendor revision to have been the head of the default branch.
 913
 914     If any non-trunk default branch revisions are found:
 915
 916     - Set their ntdbr members to True.
 917
 918     - Connect the last one with revision 1.2.
 919
 920     - Remove revision 1.1 if it is not needed.
 921
 922     """
 923
 924     try:
 925       if self.default_branch:
 926         vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
 927         vendor_lod_items = self._cvs_file_items.get_lod_items(
 928             self._cvs_file_items[vendor_cvs_branch_id]
 929             )
 930         if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
 931           return
 932       elif self._file_imported:
 933         vendor_branch_data = self.sdc.branches_data.get('1.1.1')
 934         if vendor_branch_data is None:
 935           return
 936         else:
 937           vendor_lod_items = self._cvs_file_items.get_lod_items(
 938               self._cvs_file_items[vendor_branch_data.id]
 939               )
 940           if not self._cvs_file_items.process_historical_ntdb(
 941                 vendor_lod_items
 942                 ):
 943             return
 944       else:
 945         return
 946     except VendorBranchError, e:
 947       self.collect_data.record_fatal_error(str(e))
 948       return
 949
 950     if self._file_imported:
 951       self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
 952
 953     self._cvs_file_items.check_link_consistency()
 954
 955   def get_cvs_file_items(self):
 956     """Finish up and return a CVSFileItems instance for this file.
 957
 958     This method must only be called once."""
 959
 960     self._process_ntdbrs()
 961
 962     # Break a circular reference loop, allowing the memory for self
 963     # and sdc to be freed.
 964     del self.sdc
 965
 966     return self._cvs_file_items
 967
 968
 969 class _ProjectDataCollector:
 970   def __init__(self, collect_data, project):
 971     self.collect_data = collect_data
 972     self.project = project
 973     self.num_files = 0
 974
 975     # The Trunk LineOfDevelopment object for this project:
 976     self.trunk = Trunk(
 977         self.collect_data.symbol_key_generator.gen_id(), self.project
 978         )
 979     self.project.trunk_id = self.trunk.id
 980
 981     # This causes a record for self.trunk to spring into existence:
 982     self.collect_data.register_trunk(self.trunk)
 983
 984     # A map { name -> Symbol } for all known symbols in this project.
 985     # The symbols listed here are undifferentiated into Branches and
 986     # Tags because the same name might appear as a branch in one file
 987     # and a tag in another.
 988     self.symbols = {}
 989
 990     # A map { (old_name, new_name) : count } indicating how many files
 991     # were affected by each each symbol name transformation:
 992     self.symbol_transform_counts = {}
 993
 994   def get_symbol(self, name):
 995     """Return the Symbol object for the symbol named NAME in this project.
 996
 997     If such a symbol does not yet exist, allocate a new symbol_id,
 998     create a Symbol instance, store it in self.symbols, and return it."""
 999
1000     symbol = self.symbols.get(name)
1001     if symbol is None:
1002       symbol = Symbol(
1003           self.collect_data.symbol_key_generator.gen_id(),
1004           self.project, name)
1005       self.symbols[name] = symbol
1006     return symbol
1007
1008   def log_symbol_transform(self, old_name, new_name):
1009     """Record that OLD_NAME was transformed to NEW_NAME in one file.
1010
1011     This information is used to generated a statistical summary of
1012     symbol transforms."""
1013
1014     try:
1015       self.symbol_transform_counts[old_name, new_name] += 1
1016     except KeyError:
1017       self.symbol_transform_counts[old_name, new_name] = 1
1018
1019   def summarize_symbol_transforms(self):
1020     if self.symbol_transform_counts and Log().is_on(Log.NORMAL):
1021       log = Log()
1022       log.normal('Summary of symbol transforms:')
1023       transforms = self.symbol_transform_counts.items()
1024       transforms.sort()
1025       for ((old_name, new_name), count) in transforms:
1026         if new_name is None:
1027           log.normal('    "%s" ignored in %d files' % (old_name, count,))
1028         else:
1029           log.normal(
1030               '    "%s" transformed to "%s" in %d files'
1031               % (old_name, new_name, count,)
1032               )
1033
1034   def process_file(self, cvs_file):
1035     Log().normal(cvs_file.filename)
1036     fdc = _FileDataCollector(self, cvs_file)
1037     try:
1038       cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
1039     except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
1040       self.collect_data.record_fatal_error(
1041           "%r is not a valid ,v file" % (cvs_file.filename,)
1042           )
1043       # Abort the processing of this file, but let the pass continue
1044       # with other files:
1045       return
1046     except:
1047       Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
1048       raise
1049     else:
1050       self.num_files += 1
1051
1052     return fdc.get_cvs_file_items()
1053
1054
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, revision_recorder, stats_keeper):
    """Open the output stores and start REVISION_RECORDER.

    REVISION_RECORDER is started here and is finished by close().
    STATS_KEEPER is told about every CVSFile and CVSItem that is
    stored via add_cvs_file_items()."""

    self.revision_recorder = revision_recorder
    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # The fatal errors recorded so far (a list of strings):
    self.fatal_errors = []

    # The total number of files parsed successfully, summed over the
    # projects processed so far:
    self.num_files = 0

    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSFiles:
    self.file_key_generator = KeyGenerator()

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

    self.revision_recorder.start()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Log the error immediately via Log().error(), and keep a copy in
    self.fatal_errors so that it can be reported again later (the list
    is returned by close())."""

    err = '%s: %s' % (error_prefix, err,)
    Log().error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # This causes a record to spring into existence:
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile.

    CVS_FILE_ITEMS must be a CVSFileItems instance (not None).  It is
    cleaned up, checked for link consistency, handed to the revision
    recorder, stored, and registered with the symbol statistics."""

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.revision_recorder.finish_file(cvs_file_items)
    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def process_project(self, project):
    """Collect the data for every path of PROJECT.

    Register PROJECT in Ctx()._projects, then walk its repository:
    directories are recorded via add_cvs_directory(), files are parsed
    and their items are processed and stored.  A file that could not
    be parsed (for which _ProjectDataCollector.process_file() records
    a fatal error and returns None) is skipped, so that the remaining
    files are still processed.  If no RCS file at all is encountered,
    a fatal error is recorded."""

    Ctx()._projects[project.id] = project

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in walk_repository(
          project, self.file_key_generator, self.record_fatal_error
          ):
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
      else:
        cvs_file_items = pdc.process_file(cvs_path)
        found_rcs_file = True
        # process_file() returns None for a file that is not a valid
        # ,v file.  The fatal error has already been recorded, so
        # there is nothing to process; go on to the next path instead
        # of failing on the None.
        if cvs_file_items is not None:
          self._process_cvs_file_items(cvs_file_items)

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    Log().verbose('Processed', self.num_files, 'files')

  def _register_empty_subdirectories(self):
    """Set the CVSDirectory.empty_subdirectory_ids members.

    Start with the set of all directories in Ctx()._cvs_path_db and
    discard every directory that is an ancestor of some CVSFile.  The
    id of each remaining directory that has a parent is appended to
    its parent's empty_subdirectory_ids."""

    directories = set(
        path
        for path in Ctx()._cvs_path_db.itervalues()
        if isinstance(path, CVSDirectory)
        )
    for path in Ctx()._cvs_path_db.itervalues():
      if isinstance(path, CVSFile):
        directory = path.parent_directory
        # Walk up towards the root.  Once a directory is found that
        # was already discarded, its ancestors were discarded in the
        # same earlier walk, so the loop can stop there.
        while directory is not None and directory in directories:
          directories.remove(directory)
          directory = directory.parent_directory
    for directory in directories:
      if directory.parent_directory is not None:
        directory.parent_directory.empty_subdirectory_ids.append(directory.id)

  def _set_cvs_path_ordinals(self):
    """Number the paths in Ctx()._cvs_path_db in sorted order.

    The paths are sorted using CVSPath.slow_compare, and each path's
    ordinal member is set to its position in the sorted list."""

    cvs_files = list(Ctx()._cvs_path_db.itervalues())
    cvs_files.sort(CVSPath.slow_compare)
    for (i, cvs_file) in enumerate(cvs_files):
      cvs_file.ordinal = i

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.revision_recorder.finish()
    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._register_empty_subdirectories()
    self._set_cvs_path_ordinals()
    self.revision_recorder = None
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
1219
1220