cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents.
  22
  23 As a *,v file is parsed, the information pertaining to the file is
  24 accumulated in memory, mostly in _RevisionData, _BranchData, and
  25 _TagData objects.  When parsing is complete, a final pass is made over
  26 the data to create some final dependency links, collect statistics,
  27 etc., then the _*Data objects are converted into CVSItem objects
  28 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  29 dumped into databases.
  30
  31 During the data collection, persistent unique ids are allocated to
  32 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  33 special case.  CVSItem ids are unique across all CVSItem types, and
  34 the ids are carried over from the corresponding data collection
  35 objects:
  36
  37     _RevisionData -> CVSRevision
  38
  39     _BranchData -> CVSBranch
  40
  41     _TagData -> CVSTag
  42
  43 In a later pass it is possible to convert tags <-> branches.  But even
  44 if this occurs, the new branch or tag uses the same id as the old tag
  45 or branch.
  46
  47 """
  48
  49
  50 import os
  51 import stat
  52 import re
  53
  54 from cvs2svn_lib import config
  55 from cvs2svn_lib.common import DB_OPEN_NEW
  56 from cvs2svn_lib.common import warning_prefix
  57 from cvs2svn_lib.common import error_prefix
  58 from cvs2svn_lib.common import is_trunk_revision
  59 from cvs2svn_lib.log import Log
  60 from cvs2svn_lib.context import Ctx
  61 from cvs2svn_lib.artifact_manager import artifact_manager
  62 from cvs2svn_lib.cvs_path import CVSPath
  63 from cvs2svn_lib.cvs_path import CVSFile
  64 from cvs2svn_lib.cvs_path import CVSDirectory
  65 from cvs2svn_lib.symbol import Symbol
  66 from cvs2svn_lib.symbol import Trunk
  67 from cvs2svn_lib.cvs_item import CVSRevision
  68 from cvs2svn_lib.cvs_item import CVSBranch
  69 from cvs2svn_lib.cvs_item import CVSTag
  70 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  71 from cvs2svn_lib.cvs_file_items import VendorBranchError
  72 from cvs2svn_lib.cvs_file_items import CVSFileItems
  73 from cvs2svn_lib.key_generator import KeyGenerator
  74 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  75 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  76 from cvs2svn_lib.metadata_database import MetadataDatabase
  77 from cvs2svn_lib.metadata_database import MetadataLogger
  78 from cvs2svn_lib.repository_walker import walk_repository
  79
  80 import cvs2svn_rcsparse
  81
  82
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  It accepts two or
# more groups of decimal digits separated by single dots (for example
# '1.2' or '1.7.2') and nothing else.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching branch numbers as they can appear in
# symbol definitions: a nonzero even number of digit groups, then an
# optional '0.' component, then one final digit group.  The two
# capturing groups hold everything except the optional '0.', so
# substituting r'\1\2' (as _SymbolDataCollector.define_symbol() does)
# turns '1.7.0.2' into '1.7.2' and leaves '1.7.2' unchanged.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
  99
 100
 101 def is_branch_revision_number(rev):
 102   """Return True iff REV is a branch revision number.
 103
 104   REV is a CVS revision number in canonical form (i.e., with zeros
 105   removed).  Return True iff it refers to a whole branch, as opposed
 106   to a single revision."""
 107
 108   return rev.count('.') % 2 == 0
 109
 110
 111 def is_same_line_of_development(rev1, rev2):
 112   """Return True if rev1 and rev2 are on the same line of
 113   development (i.e., both on trunk, or both on the same branch);
 114   return False otherwise.  Either rev1 or rev2 can be None, in
 115   which case automatically return False."""
 116
 117   if rev1 is None or rev2 is None:
 118     return False
 119   if rev1.count('.') == 1 and rev2.count('.') == 1:
 120     return True
 121   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 122     return True
 123   return False
 124
 125
 126 class _RevisionData:
 127   """We track the state of each revision so that in set_revision_info,
 128   we can determine if our op is an add/change/delete.  We can do this
 129   because in set_revision_info, we'll have all of the _RevisionData
 130   for a file at our fingertips, and we need to examine the state of
 131   our prev_rev to determine if we're an add or a change.  Without the
 132   state of the prev_rev, we are unable to distinguish between an add
 133   and a change."""
 134
 135   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 136     # The id of this revision:
 137     self.cvs_rev_id = cvs_rev_id
 138     self.rev = rev
 139     self.timestamp = timestamp
 140     self.author = author
 141     self.original_timestamp = timestamp
 142     self.state = state
 143
 144     # If this is the first revision on a branch, then this is the
 145     # branch_data of that branch; otherwise it is None.
 146     self.parent_branch_data = None
 147
 148     # The revision number of the parent of this revision along the
 149     # same line of development, if any.  For the first revision R on a
 150     # branch, we consider the revision from which R sprouted to be the
 151     # 'parent'.  If this is the root revision in the file's revision
 152     # tree, then this field is None.
 153     #
 154     # Note that this revision can't be determined arithmetically (due
 155     # to cvsadmin -o), which is why this field is necessary.
 156     self.parent = None
 157
 158     # The revision number of the primary child of this revision (the
 159     # child along the same line of development), if any; otherwise,
 160     # None.
 161     self.child = None
 162
 163     # The _BranchData instances of branches that sprout from this
 164     # revision, sorted in ascending order by branch number.  It would
 165     # be inconvenient to initialize it here because we would have to
 166     # scan through all branches known by the _SymbolDataCollector to
 167     # find the ones having us as the parent.  Instead, this
 168     # information is filled in by
 169     # _FileDataCollector._resolve_dependencies() and sorted by
 170     # _FileDataCollector._sort_branches().
 171     self.branches_data = []
 172
 173     # The revision numbers of the first commits on any branches on
 174     # which commits occurred.  This dependency is kept explicitly
 175     # because otherwise a revision-only topological sort would miss
 176     # the dependency that exists via branches_data.
 177     self.branches_revs_data = []
 178
 179     # The _TagData instances of tags that are connected to this
 180     # revision.
 181     self.tags_data = []
 182
 183     # A token that may be set by a RevisionCollector, then used by
 184     # RevisionReader to obtain the text again.
 185     self.revision_reader_token = None
 186
 187   def get_first_on_branch_id(self):
 188     return self.parent_branch_data and self.parent_branch_data.id
 189
 190
 191 class _SymbolData:
 192   """Collection area for information about a symbol in a single CVSFile.
 193
 194   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 195   Tag regardless of whether self is a _BranchData or a _TagData."""
 196
 197   def __init__(self, id, symbol):
 198     """Initialize an object for SYMBOL."""
 199
 200     # The unique id that will be used for this particular symbol in
 201     # this particular file.  This same id will be used for the CVSItem
 202     # that is derived from this instance.
 203     self.id = id
 204
 205     # An instance of Symbol.
 206     self.symbol = symbol
 207
 208
 209 class _BranchData(_SymbolData):
 210   """Collection area for information about a Branch in a single CVSFile."""
 211
 212   def __init__(self, id, symbol, branch_number):
 213     _SymbolData.__init__(self, id, symbol)
 214
 215     # The branch number (e.g., '1.5.2') of this branch.
 216     self.branch_number = branch_number
 217
 218     # The revision number of the revision from which this branch
 219     # sprouts (e.g., '1.5').
 220     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 221
 222     # The revision number of the first commit on this branch, if any
 223     # (e.g., '1.5.2.1'); otherwise, None.
 224     self.child = None
 225
 226
 227 class _TagData(_SymbolData):
 228   """Collection area for information about a Tag in a single CVSFile."""
 229
 230   def __init__(self, id, symbol, rev):
 231     _SymbolData.__init__(self, id, symbol)
 232
 233     # The revision number being tagged (e.g., '1.5.2.3').
 234     self.rev = rev
 235
 236
 237 class _SymbolDataCollector(object):
 238   """Collect information about symbols in a single CVSFile."""
 239
 240   def __init__(self, fdc, cvs_file):
 241     self.fdc = fdc
 242     self.cvs_file = cvs_file
 243
 244     self.pdc = self.fdc.pdc
 245     self.collect_data = self.fdc.collect_data
 246
 247     # A list [(name, revision), ...] of symbols defined in the header
 248     # of the file.  The name has already been transformed using the
 249     # symbol transform rules.  If the symbol transform rules indicate
 250     # that the symbol should be ignored, then it is never added to
 251     # this list.  This list is processed then deleted in
 252     # process_symbols().
 253     self._symbol_defs = []
 254
 255     # A set containing the transformed names of symbols in this file
 256     # (used to detect duplicates during processing of unlabeled
 257     # branches):
 258     self._defined_symbols = set()
 259
 260     # Map { branch_number : _BranchData }, where branch_number has an
 261     # odd number of digits.
 262     self.branches_data = { }
 263
 264     # Map { revision : [ tag_data ] }, where revision has an even
 265     # number of digits, and the value is a list of _TagData objects
 266     # for tags that apply to that revision.
 267     self.tags_data = { }
 268
 269   def _add_branch(self, name, branch_number):
 270     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 271     and derive and record the revision from which NAME sprouts.
 272     BRANCH_NUMBER is an RCS branch number with an odd number of
 273     components, for example '1.7.2' (never '1.7.0.2').  Return the
 274     _BranchData instance (which is usually newly-created)."""
 275
 276     branch_data = self.branches_data.get(branch_number)
 277
 278     if branch_data is not None:
 279       Log().warn(
 280           "%s: in '%s':\n"
 281           "   branch '%s' already has name '%s',\n"
 282           "   cannot also have name '%s', ignoring the latter\n"
 283           % (warning_prefix,
 284              self.cvs_file.filename, branch_number,
 285              branch_data.symbol.name, name)
 286           )
 287       return branch_data
 288
 289     symbol = self.pdc.get_symbol(name)
 290     branch_data = _BranchData(
 291         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 292         )
 293     self.branches_data[branch_number] = branch_data
 294     return branch_data
 295
 296   def _construct_distinct_name(self, name, original_name):
 297     """Construct a distinct symbol name from NAME.
 298
 299     If NAME is distinct, return it.  If it is already used in this
 300     file (as determined from its presence in self._defined_symbols),
 301     construct and return a new name that is not already used."""
 302
 303     if name not in self._defined_symbols:
 304       return name
 305     else:
 306       index = 1
 307       while True:
 308         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 309         if dup_name not in self._defined_symbols:
 310           self.collect_data.record_fatal_error(
 311               "Symbol name '%s' is already used in '%s'.\n"
 312               "The unlabeled branch '%s' must be renamed using "
 313               "--symbol-transform."
 314               % (name, self.cvs_file.filename, original_name,)
 315               )
 316           return dup_name
 317
 318   def _add_unlabeled_branch(self, branch_number):
 319     original_name = "unlabeled-" + branch_number
 320     name = self.transform_symbol(original_name, branch_number)
 321     if name is None:
 322       self.collect_data.record_fatal_error(
 323           "The unlabeled branch '%s' in '%s' contains commits.\n"
 324           "It may not be ignored via a symbol transform.  (Use --exclude "
 325           "instead.)"
 326           % (original_name, self.cvs_file.filename,)
 327           )
 328       # Retain the original name to allow the conversion to continue:
 329       name = original_name
 330
 331     distinct_name = self._construct_distinct_name(name, original_name)
 332     self._defined_symbols.add(distinct_name)
 333     return self._add_branch(distinct_name, branch_number)
 334
 335   def _add_tag(self, name, revision):
 336     """Record that tag NAME refers to the specified REVISION."""
 337
 338     symbol = self.pdc.get_symbol(name)
 339     tag_data = _TagData(
 340         self.collect_data.item_key_generator.gen_id(), symbol, revision
 341         )
 342     self.tags_data.setdefault(revision, []).append(tag_data)
 343     return tag_data
 344
 345   def transform_symbol(self, name, revision):
 346     """Transform a symbol according to the project's symbol transforms.
 347
 348     Transform the symbol with the original name NAME and canonicalized
 349     revision number REVISION.  Return the new symbol name or None if
 350     the symbol should be ignored entirely.
 351
 352     Log the results of the symbol transform if necessary."""
 353
 354     old_name = name
 355     # Apply any user-defined symbol transforms to the symbol name:
 356     name = self.cvs_file.project.transform_symbol(
 357         self.cvs_file, name, revision
 358         )
 359
 360     if name is None:
 361       # Ignore symbol:
 362       self.pdc.log_symbol_transform(old_name, None)
 363       Log().verbose(
 364           "   symbol '%s'=%s ignored in %s"
 365           % (old_name, revision, self.cvs_file.filename,)
 366           )
 367     else:
 368       if name != old_name:
 369         self.pdc.log_symbol_transform(old_name, name)
 370         Log().verbose(
 371             "   symbol '%s'=%s transformed to '%s' in %s"
 372             % (old_name, revision, name, self.cvs_file.filename,)
 373             )
 374
 375     return name
 376
 377   def define_symbol(self, name, revision):
 378     """Record a symbol definition for later processing."""
 379
 380     # Canonicalize the revision number:
 381     revision = _branch_revision_re.sub(r'\1\2', revision)
 382
 383     # Apply any user-defined symbol transforms to the symbol name:
 384     name = self.transform_symbol(name, revision)
 385
 386     if name is not None:
 387       # Verify that the revision number is valid:
 388       if _valid_revision_re.match(revision):
 389         # The revision number is valid; record it for later processing:
 390         self._symbol_defs.append( (name, revision) )
 391       else:
 392         Log().warn(
 393             'In %r:\n'
 394             '    branch %r references invalid revision %s\n'
 395             '    and will be ignored.'
 396             % (self.cvs_file.filename, name, revision,)
 397             )
 398
 399   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 400     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 401
 402     Duplicate definitions of symbol names have been seen in the wild,
 403     and they can also happen when --symbol-transform is used.  If a
 404     symbol is defined to the same revision number repeatedly, then
 405     ignore all but the last definition."""
 406
 407     # Make a copy, since we have to iterate through the definitions
 408     # twice:
 409     symbol_defs = list(symbol_defs)
 410
 411     # A map { (name, revision) : [index,...] } of the indexes where
 412     # symbol definitions name=revision were found:
 413     known_definitions = {}
 414     for (i, symbol_def) in enumerate(symbol_defs):
 415       known_definitions.setdefault(symbol_def, []).append(i)
 416
 417     # A set of the indexes of entries that have to be removed from
 418     # symbol_defs:
 419     dup_indexes = set()
 420     for ((name, revision), indexes) in known_definitions.iteritems():
 421       if len(indexes) > 1:
 422         Log().verbose(
 423             "in %r:\n"
 424             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 425             % (self.cvs_file.filename, name, revision,)
 426             )
 427         dup_indexes.update(indexes[:-1])
 428
 429     for (i, symbol_def) in enumerate(symbol_defs):
 430       if i not in dup_indexes:
 431         yield symbol_def
 432
 433   def _process_duplicate_defs(self, symbol_defs):
 434     """Iterate through SYMBOL_DEFS, processing duplicate names.
 435
 436     Duplicate definitions of symbol names have been seen in the wild,
 437     and they can also happen when --symbol-transform is used.  If a
 438     symbol is defined multiple times, then it is a fatal error.  This
 439     method should be called after _eliminate_trivial_duplicate_defs()."""
 440
 441     # Make a copy, since we have to access multiple times:
 442     symbol_defs = list(symbol_defs)
 443
 444     # A map {name : [index,...]} mapping the names of symbols to a
 445     # list of their definitions' indexes in symbol_defs:
 446     known_symbols = {}
 447     for (i, (name, revision)) in enumerate(symbol_defs):
 448       known_symbols.setdefault(name, []).append(i)
 449
 450     known_symbols = known_symbols.items()
 451     known_symbols.sort()
 452     dup_indexes = set()
 453     for (name, indexes) in known_symbols:
 454       if len(indexes) > 1:
 455         # This symbol was defined multiple times.
 456         self.collect_data.record_fatal_error(
 457             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 458                 name, self.cvs_file.filename,
 459                 ' '.join([symbol_defs[i][1] for i in indexes]),
 460                 )
 461             )
 462         # Ignore all but the last definition for now, to allow the
 463         # conversion to proceed:
 464         dup_indexes.update(indexes[:-1])
 465
 466     for (i, symbol_def) in enumerate(symbol_defs):
 467       if i not in dup_indexes:
 468         yield symbol_def
 469
 470   def _process_symbol(self, name, revision):
 471     """Process a symbol called NAME, which is associated with REVISON.
 472
 473     REVISION is a canonical revision number with zeros removed, for
 474     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 475     transformed branch or tag name."""
 476
 477     # Add symbol to our records:
 478     if is_branch_revision_number(revision):
 479       self._add_branch(name, revision)
 480     else:
 481       self._add_tag(name, revision)
 482
 483   def process_symbols(self):
 484     """Process the symbol definitions from SELF._symbol_defs."""
 485
 486     symbol_defs = self._symbol_defs
 487     del self._symbol_defs
 488
 489     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 490     symbol_defs = self._process_duplicate_defs(symbol_defs)
 491
 492     for (name, revision) in symbol_defs:
 493       self._defined_symbols.add(name)
 494       self._process_symbol(name, revision)
 495
 496   @staticmethod
 497   def rev_to_branch_number(revision):
 498     """Return the branch_number of the branch on which REVISION lies.
 499
 500     REVISION is a branch revision number with an even number of
 501     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 502     The return value is the branch number (for example, '1.7.2').
 503     Return none iff REVISION is a trunk revision such as '1.2'."""
 504
 505     if is_trunk_revision(revision):
 506       return None
 507     return revision[:revision.rindex(".")]
 508
 509   def rev_to_branch_data(self, revision):
 510     """Return the branch_data of the branch on which REVISION lies.
 511
 512     REVISION must be a branch revision number with an even number of
 513     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 514     Raise KeyError iff REVISION is unknown."""
 515
 516     assert not is_trunk_revision(revision)
 517
 518     return self.branches_data[self.rev_to_branch_number(revision)]
 519
 520   def rev_to_lod(self, revision):
 521     """Return the line of development on which REVISION lies.
 522
 523     REVISION must be a revision number with an even number of
 524     components.  Raise KeyError iff REVISION is unknown."""
 525
 526     if is_trunk_revision(revision):
 527       return self.pdc.trunk
 528     else:
 529       return self.rev_to_branch_data(revision).symbol
 530
 531
 532 class _FileDataCollector(cvs2svn_rcsparse.Sink):
 533   """Class responsible for collecting RCS data for a particular file.
 534
 535   Any collected data that need to be remembered are stored into the
 536   referenced CollectData instance."""
 537
 538   def __init__(self, pdc, cvs_file):
 539     """Create an object that is prepared to receive data for CVS_FILE.
 540     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 541     information collected about the file."""
 542
 543     self.pdc = pdc
 544     self.cvs_file = cvs_file
 545
 546     self.collect_data = self.pdc.collect_data
 547     self.project = self.cvs_file.project
 548
 549     # A place to store information about the symbols in this file:
 550     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 551
 552     # { revision : _RevisionData instance }
 553     self._rev_data = { }
 554
 555     # Lists [ (parent, child) ] of revision number pairs indicating
 556     # that revision child depends on revision parent along the main
 557     # line of development.
 558     self._primary_dependencies = []
 559
 560     # If set, this is an RCS branch number -- rcsparse calls this the
 561     # "principal branch", but CVS and RCS refer to it as the "default
 562     # branch", so that's what we call it, even though the rcsparse API
 563     # setter method is still 'set_principal_branch'.
 564     self.default_branch = None
 565
 566     # True iff revision 1.1 of the file appears to have been imported
 567     # (as opposed to added normally).
 568     self._file_imported = False
 569
 570   def _get_rev_id(self, revision):
 571     if revision is None:
 572       return None
 573     return self._rev_data[revision].cvs_rev_id
 574
 575   def set_principal_branch(self, branch):
 576     """This is a callback method declared in Sink."""
 577
 578     if branch.find('.') == -1:
 579       # This just sets the default branch to trunk.  Normally this
 580       # shouldn't occur, but it has been seen in at least one CVS
 581       # repository.  Just ignore it.
 582       pass
 583     else:
 584       self.default_branch = branch
 585
 586   def set_expansion(self, mode):
 587     """This is a callback method declared in Sink."""
 588
 589     self.cvs_file.mode = mode
 590
 591   def set_description(self, description):
 592     """This is a callback method declared in Sink."""
 593
 594     self.cvs_file.description = description
 595
 596   def define_tag(self, name, revision):
 597     """Remember the symbol name and revision, but don't process them yet.
 598
 599     This is a callback method declared in Sink."""
 600
 601     self.sdc.define_symbol(name, revision)
 602
 603   def admin_completed(self):
 604     """This is a callback method declared in Sink."""
 605
 606     self.sdc.process_symbols()
 607
 608   def define_revision(self, revision, timestamp, author, state,
 609                       branches, next):
 610     """This is a callback method declared in Sink."""
 611
 612     for branch in branches:
 613       try:
 614         branch_data = self.sdc.rev_to_branch_data(branch)
 615       except KeyError:
 616         # Normally we learn about the branches from the branch names
 617         # and numbers parsed from the symbolic name header.  But this
 618         # must have been an unlabeled branch that slipped through the
 619         # net.  Generate a name for it and create a _BranchData record
 620         # for it now.
 621         branch_data = self.sdc._add_unlabeled_branch(
 622             self.sdc.rev_to_branch_number(branch))
 623
 624       assert branch_data.child is None
 625       branch_data.child = branch
 626
 627     if revision in self._rev_data:
 628       # This revision has already been seen.
 629       Log().error('File %r contains duplicate definitions of revision %s.'
 630                   % (self.cvs_file.filename, revision,))
 631       raise RuntimeError
 632
 633     # Record basic information about the revision:
 634     rev_data = _RevisionData(
 635         self.collect_data.item_key_generator.gen_id(),
 636         revision, int(timestamp), author, state)
 637     self._rev_data[revision] = rev_data
 638
 639     # When on trunk, the RCS 'next' revision number points to what
 640     # humans might consider to be the 'previous' revision number.  For
 641     # example, 1.3's RCS 'next' is 1.2.
 642     #
 643     # However, on a branch, the RCS 'next' revision number really does
 644     # point to what humans would consider to be the 'next' revision
 645     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 646     #
 647     # In other words, in RCS, 'next' always means "where to find the next
 648     # deltatext that you need this revision to retrieve.
 649     #
 650     # That said, we don't *want* RCS's behavior here, so we determine
 651     # whether we're on trunk or a branch and set the dependencies
 652     # accordingly.
 653     if next:
 654       if is_trunk_revision(revision):
 655         self._primary_dependencies.append( (next, revision,) )
 656       else:
 657         self._primary_dependencies.append( (revision, next,) )
 658
 659   def _resolve_primary_dependencies(self):
 660     """Resolve the dependencies listed in self._primary_dependencies."""
 661
 662     for (parent, child,) in self._primary_dependencies:
 663       parent_data = self._rev_data[parent]
 664       assert parent_data.child is None
 665       parent_data.child = child
 666
 667       child_data = self._rev_data[child]
 668       assert child_data.parent is None
 669       child_data.parent = parent
 670
 671   def _resolve_branch_dependencies(self):
 672     """Resolve dependencies involving branches."""
 673
 674     for branch_data in self.sdc.branches_data.values():
 675       # The branch_data's parent has the branch as a child regardless
 676       # of whether the branch had any subsequent commits:
 677       try:
 678         parent_data = self._rev_data[branch_data.parent]
 679       except KeyError:
 680         Log().warn(
 681             'In %r:\n'
 682             '    branch %r references non-existing revision %s\n'
 683             '    and will be ignored.'
 684             % (self.cvs_file.filename, branch_data.symbol.name,
 685                branch_data.parent,))
 686         del self.sdc.branches_data[branch_data.branch_number]
 687       else:
 688         parent_data.branches_data.append(branch_data)
 689
 690         # If the branch has a child (i.e., something was committed on
 691         # the branch), then we store a reference to the branch_data
 692         # there, define the child's parent to be the branch's parent,
 693         # and list the child in the branch parent's branches_revs_data:
 694         if branch_data.child is not None:
 695           child_data = self._rev_data[branch_data.child]
 696           assert child_data.parent_branch_data is None
 697           child_data.parent_branch_data = branch_data
 698           assert child_data.parent is None
 699           child_data.parent = branch_data.parent
 700           parent_data.branches_revs_data.append(branch_data.child)
 701
 702   def _sort_branches(self):
 703     """Sort the branches sprouting from each revision in creation order.
 704
 705     Creation order is taken to be the reverse of the order that they
 706     are listed in the symbols part of the RCS file.  (If a branch is
 707     created then deleted, a later branch can be assigned the recycled
 708     branch number; therefore branch numbers are not an indication of
 709     creation order.)"""
 710
 711     for rev_data in self._rev_data.values():
 712       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 713
 714   def _resolve_tag_dependencies(self):
 715     """Resolve dependencies involving tags."""
 716
 717     for (rev, tag_data_list) in self.sdc.tags_data.items():
 718       try:
 719         parent_data = self._rev_data[rev]
 720       except KeyError:
 721         Log().warn(
 722             'In %r:\n'
 723             '    the following tag(s) reference non-existing revision %s\n'
 724             '    and will be ignored:\n'
 725             '    %s' % (
 726                 self.cvs_file.filename, rev,
 727                 ', '.join([repr(tag_data.symbol.name)
 728                            for tag_data in tag_data_list]),))
 729         del self.sdc.tags_data[rev]
 730       else:
 731         for tag_data in tag_data_list:
 732           assert tag_data.rev == rev
 733           # The tag_data's rev has the tag as a child:
 734           parent_data.tags_data.append(tag_data)
 735
 736   def _determine_operation(self, rev_data):
 737     prev_rev_data = self._rev_data.get(rev_data.parent)
 738     return cvs_revision_type_map[(
 739         rev_data.state != 'dead',
 740         prev_rev_data is not None and prev_rev_data.state != 'dead',
 741         )]
 742
 743   def _get_cvs_revision(self, rev_data):
 744     """Create and return a CVSRevision for REV_DATA."""
 745
 746     branch_ids = [
 747         branch_data.id
 748         for branch_data in rev_data.branches_data
 749         ]
 750
 751     branch_commit_ids = [
 752         self._get_rev_id(rev)
 753         for rev in rev_data.branches_revs_data
 754         ]
 755
 756     tag_ids = [
 757         tag_data.id
 758         for tag_data in rev_data.tags_data
 759         ]
 760
 761     revision_type = self._determine_operation(rev_data)
 762
 763     return revision_type(
 764         self._get_rev_id(rev_data.rev), self.cvs_file,
 765         rev_data.timestamp, None,
 766         self._get_rev_id(rev_data.parent),
 767         self._get_rev_id(rev_data.child),
 768         rev_data.rev,
 769         True,
 770         self.sdc.rev_to_lod(rev_data.rev),
 771         rev_data.get_first_on_branch_id(),
 772         False, None, None,
 773         tag_ids, branch_ids, branch_commit_ids,
 774         rev_data.revision_reader_token
 775         )
 776
 777   def _get_cvs_revisions(self):
 778     """Generate the CVSRevisions present in this file."""
 779
 780     for rev_data in self._rev_data.itervalues():
 781       yield self._get_cvs_revision(rev_data)
 782
 783   def _get_cvs_branches(self):
 784     """Generate the CVSBranches present in this file."""
 785
 786     for branch_data in self.sdc.branches_data.values():
 787       yield CVSBranch(
 788           branch_data.id, self.cvs_file, branch_data.symbol,
 789           branch_data.branch_number,
 790           self.sdc.rev_to_lod(branch_data.parent),
 791           self._get_rev_id(branch_data.parent),
 792           self._get_rev_id(branch_data.child),
 793           None,
 794           )
 795
 796   def _get_cvs_tags(self):
 797     """Generate the CVSTags present in this file."""
 798
 799     for tags_data in self.sdc.tags_data.values():
 800       for tag_data in tags_data:
 801         yield CVSTag(
 802             tag_data.id, self.cvs_file, tag_data.symbol,
 803             self.sdc.rev_to_lod(tag_data.rev),
 804             self._get_rev_id(tag_data.rev),
 805             None,
 806             )
 807
 808   def tree_completed(self):
 809     """The revision tree has been parsed.
 810
 811     Analyze it for consistency and connect some loose ends.
 812
 813     This is a callback method declared in Sink."""
 814
 815     self._resolve_primary_dependencies()
 816     self._resolve_branch_dependencies()
 817     self._sort_branches()
 818     self._resolve_tag_dependencies()
 819
 820     # Compute the preliminary CVSFileItems for this file:
 821     cvs_items = []
 822     cvs_items.extend(self._get_cvs_revisions())
 823     cvs_items.extend(self._get_cvs_branches())
 824     cvs_items.extend(self._get_cvs_tags())
 825     self._cvs_file_items = CVSFileItems(
 826         self.cvs_file, self.pdc.trunk, cvs_items
 827         )
 828
 829     self._cvs_file_items.check_link_consistency()
 830
 831   def set_revision_info(self, revision, log, text):
 832     """This is a callback method declared in Sink."""
 833
 834     rev_data = self._rev_data[revision]
 835     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 836
 837     if cvs_rev.metadata_id is not None:
 838       # Users have reported problems with repositories in which the
 839       # deltatext block for revision 1.1 appears twice.  It is not
 840       # known whether this results from a CVS/RCS bug, or from botched
 841       # hand-editing of the repository.  In any case, empirically, cvs
 842       # and rcs both use the first version when checking out data, so
 843       # that's what we will do.  (For the record: "cvs log" fails on
 844       # such a file; "rlog" prints the log message from the first
 845       # block and ignores the second one.)
 846       Log().warn(
 847           "%s: in '%s':\n"
 848           "   Deltatext block for revision %s appeared twice;\n"
 849           "   ignoring the second occurrence.\n"
 850           % (warning_prefix, self.cvs_file.filename, revision,)
 851           )
 852       return
 853
 854     if is_trunk_revision(revision):
 855       branch_name = None
 856     else:
 857       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 858
 859     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 860         self.project, branch_name, rev_data.author, log
 861         )
 862     cvs_rev.deltatext_exists = bool(text)
 863
 864     # If this is revision 1.1, determine whether the file appears to
 865     # have been created via 'cvs add' instead of 'cvs import'.  The
 866     # test is that the log message CVS uses for 1.1 in imports is
 867     # "Initial revision\n" with no period.  (This fact helps determine
 868     # whether this file might have had a default branch in the past.)
 869     if revision == '1.1':
 870       self._file_imported = (log == 'Initial revision\n')
 871
 872   def parse_completed(self):
 873     """Finish the processing of this file.
 874
 875     This is a callback method declared in Sink."""
 876
 877     # Make sure that there was an info section for each revision:
 878     for cvs_item in self._cvs_file_items.values():
 879       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 880         self.collect_data.record_fatal_error(
 881             '%r has no deltatext section for revision %s'
 882             % (self.cvs_file.filename, cvs_item.rev,)
 883             )
 884
  def _process_ntdbrs(self):
    """Fix up any non-trunk default branch revisions (if present).

    This method returns nothing.  The adjustments are applied to
    self._cvs_file_items by the CVSFileItems methods called below
    (process_live_ntdb(), process_historical_ntdb() and
    imported_remove_1_1()).

    There are two cases to handle:

    One case is simple.  The RCS file lists a default branch
    explicitly in its header, such as '1.1.1'.  In this case, we know
    that every revision on the vendor branch is to be treated as head
    of trunk at that point in time.

    But there's also a degenerate case.  The RCS file does not
    currently have a default branch, yet we can deduce that for some
    period in the past it probably *did* have one.  For example, the
    file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
    dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
    after 1.2.  In this case, we should record 1.1.1.96 as the last
    vendor revision to have been the head of the default branch.

    If any non-trunk default branch revisions are found, the intended
    outcome (the work itself happens inside CVSFileItems) is:

    - Set their ntdbr members to True.

    - Connect the last one with revision 1.2.

    - Remove revision 1.1 if it is not needed.

    A VendorBranchError raised while doing so is recorded as a fatal
    error and ends the processing of non-trunk default branch
    revisions for this file.

    """

    try:
      if self.default_branch:
        # Case 1: a default branch is set explicitly for this file.
        vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
        vendor_lod_items = self._cvs_file_items.get_lod_items(
            self._cvs_file_items[vendor_cvs_branch_id]
            )
        if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
          # A false result means that there is nothing more to do here.
          return
      elif self._file_imported:
        # Case 2: no default branch now, but the log message of
        # revision 1.1 suggests that the file was imported, so look
        # for a vendor branch '1.1.1'.
        vendor_branch_data = self.sdc.branches_data.get('1.1.1')
        if vendor_branch_data is None:
          return
        else:
          vendor_lod_items = self._cvs_file_items.get_lod_items(
              self._cvs_file_items[vendor_branch_data.id]
              )
          if not self._cvs_file_items.process_historical_ntdb(
                vendor_lod_items
                ):
            return
      else:
        # Neither a default branch nor an imported file:
        return
    except VendorBranchError, e:
      self.collect_data.record_fatal_error(str(e))
      return

    # Only reached if one of the two cases above found non-trunk
    # default branch revisions, so vendor_lod_items is set here.
    if self._file_imported:
      self._cvs_file_items.imported_remove_1_1(vendor_lod_items)

    self._cvs_file_items.check_link_consistency()
 947
 948   def get_cvs_file_items(self):
 949     """Finish up and return a CVSFileItems instance for this file.
 950
 951     This method must only be called once."""
 952
 953     self._process_ntdbrs()
 954
 955     # Break a circular reference loop, allowing the memory for self
 956     # and sdc to be freed.
 957     del self.sdc
 958
 959     return self._cvs_file_items
 960
 961
 962 class _ProjectDataCollector:
 963   def __init__(self, collect_data, project):
 964     self.collect_data = collect_data
 965     self.project = project
 966     self.num_files = 0
 967
 968     # The Trunk LineOfDevelopment object for this project:
 969     self.trunk = Trunk(
 970         self.collect_data.symbol_key_generator.gen_id(), self.project
 971         )
 972     self.project.trunk_id = self.trunk.id
 973
 974     # This causes a record for self.trunk to spring into existence:
 975     self.collect_data.register_trunk(self.trunk)
 976
 977     # A map { name -> Symbol } for all known symbols in this project.
 978     # The symbols listed here are undifferentiated into Branches and
 979     # Tags because the same name might appear as a branch in one file
 980     # and a tag in another.
 981     self.symbols = {}
 982
 983     # A map { (old_name, new_name) : count } indicating how many files
 984     # were affected by each each symbol name transformation:
 985     self.symbol_transform_counts = {}
 986
 987   def get_symbol(self, name):
 988     """Return the Symbol object for the symbol named NAME in this project.
 989
 990     If such a symbol does not yet exist, allocate a new symbol_id,
 991     create a Symbol instance, store it in self.symbols, and return it."""
 992
 993     symbol = self.symbols.get(name)
 994     if symbol is None:
 995       symbol = Symbol(
 996           self.collect_data.symbol_key_generator.gen_id(),
 997           self.project, name)
 998       self.symbols[name] = symbol
 999     return symbol
1000
1001   def log_symbol_transform(self, old_name, new_name):
1002     """Record that OLD_NAME was transformed to NEW_NAME in one file.
1003
1004     This information is used to generated a statistical summary of
1005     symbol transforms."""
1006
1007     try:
1008       self.symbol_transform_counts[old_name, new_name] += 1
1009     except KeyError:
1010       self.symbol_transform_counts[old_name, new_name] = 1
1011
1012   def summarize_symbol_transforms(self):
1013     if self.symbol_transform_counts and Log().is_on(Log.NORMAL):
1014       log = Log()
1015       log.normal('Summary of symbol transforms:')
1016       transforms = self.symbol_transform_counts.items()
1017       transforms.sort()
1018       for ((old_name, new_name), count) in transforms:
1019         if new_name is None:
1020           log.normal('    "%s" ignored in %d files' % (old_name, count,))
1021         else:
1022           log.normal(
1023               '    "%s" transformed to "%s" in %d files'
1024               % (old_name, new_name, count,)
1025               )
1026
1027   def process_file(self, cvs_file):
1028     Log().normal(cvs_file.filename)
1029     fdc = _FileDataCollector(self, cvs_file)
1030     try:
1031       cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
1032     except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
1033       self.collect_data.record_fatal_error(
1034           "%r is not a valid ,v file" % (cvs_file.filename,)
1035           )
1036       # Abort the processing of this file, but let the pass continue
1037       # with other files:
1038       return
1039     except:
1040       Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
1041       raise
1042     else:
1043       self.num_files += 1
1044
1045     return fdc.get_cvs_file_items()
1046
1047
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, stats_keeper):
    """Open the stores used during collection.

    STATS_KEEPER is the object to which the processed CVSFiles and
    CVSItems are reported (see add_cvs_file_items())."""

    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # The fatal errors recorded so far (see record_fatal_error()):
    self.fatal_errors = []

    # The number of files processed so far, over all projects:
    self.num_files = 0

    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSFiles:
    self.file_key_generator = KeyGenerator()

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    Log().error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # This causes a record to spring into existence:
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile."""

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def process_project(self, project):
    """Collect the data for all of the paths found in PROJECT.

    Directories are recorded directly; files are parsed and their
    items processed and stored.  A file that could not be parsed has
    already been reported as a fatal error by the time it is skipped
    here.  If the project contains no RCS files at all, that is
    recorded as a fatal error too."""

    Ctx()._projects[project.id] = project

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in walk_repository(
          project, self.file_key_generator, self.record_fatal_error
          ):
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
      else:
        found_rcs_file = True
        cvs_file_items = pdc.process_file(cvs_path)
        # process_file() returns None for a file that is not a valid
        # ,v file.  Skip such a file instead of failing on the None,
        # so that the pass can continue with the other files.
        if cvs_file_items is not None:
          self._process_cvs_file_items(cvs_file_items)

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    Log().verbose('Processed', self.num_files, 'files')

  def _register_empty_subdirectories(self):
    """Set the CVSDirectory.empty_subdirectory_id members."""

    # Start with every known directory as a candidate ...
    directories = set(
        path
        for path in Ctx()._cvs_path_db.itervalues()
        if isinstance(path, CVSDirectory)
        )
    # ... then strike out each directory that has a file somewhere
    # below it, walking up from every file towards the root.  The walk
    # stops at the first ancestor that was already struck out.
    for path in Ctx()._cvs_path_db.itervalues():
      if isinstance(path, CVSFile):
        directory = path.parent_directory
        while directory is not None and directory in directories:
          directories.remove(directory)
          directory = directory.parent_directory
    # Whatever is left contains no files; tell the parents about it:
    for directory in directories:
      if directory.parent_directory is not None:
        directory.parent_directory.empty_subdirectory_ids.append(directory.id)

  def _set_cvs_path_ordinals(self):
    """Number the paths in _cvs_path_db in CVSPath.slow_compare order.

    Each path's ordinal member is set to its position in that order."""

    cvs_files = list(Ctx()._cvs_path_db.itervalues())
    cvs_files.sort(CVSPath.slow_compare)
    for (i, cvs_file) in enumerate(cvs_files):
      cvs_file.ordinal = i

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._register_empty_subdirectories()
    self._set_cvs_path_ordinals()
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
1206
1207