cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents (though even the file contents
  22 might be recorded by the RevisionRecorder if one is configured).
  23
  24 As a *,v file is parsed, the information pertaining to the file is
  25 accumulated in memory, mostly in _RevisionData, _BranchData, and
  26 _TagData objects.  When parsing is complete, a final pass is made over
  27 the data to create some final dependency links, collect statistics,
  28 etc., then the _*Data objects are converted into CVSItem objects
  29 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  30 dumped into databases.
  31
  32 During the data collection, persistent unique ids are allocated to
  33 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  34 special case.  CVSItem ids are unique across all CVSItem types, and
  35 the ids are carried over from the corresponding data collection
  36 objects:
  37
  38     _RevisionData -> CVSRevision
  39
  40     _BranchData -> CVSBranch
  41
  42     _TagData -> CVSTag
  43
  44 In a later pass it is possible to convert tags <-> branches.  But even
  45 if this occurs, the new branch or tag uses the same id as the old tag
  46 or branch.
  47
  48 """
  49
  50
  51 import os
  52 import stat
  53 import re
  54
  55 from cvs2svn_lib import config
  56 from cvs2svn_lib.common import DB_OPEN_NEW
  57 from cvs2svn_lib.common import warning_prefix
  58 from cvs2svn_lib.common import error_prefix
  59 from cvs2svn_lib.log import Log
  60 from cvs2svn_lib.context import Ctx
  61 from cvs2svn_lib.artifact_manager import artifact_manager
  62 from cvs2svn_lib.cvs_path import CVSPath
  63 from cvs2svn_lib.cvs_path import CVSDirectory
  64 from cvs2svn_lib.symbol import Symbol
  65 from cvs2svn_lib.symbol import Trunk
  66 from cvs2svn_lib.cvs_item import CVSRevision
  67 from cvs2svn_lib.cvs_item import CVSBranch
  68 from cvs2svn_lib.cvs_item import CVSTag
  69 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  70 from cvs2svn_lib.cvs_file_items import VendorBranchError
  71 from cvs2svn_lib.cvs_file_items import CVSFileItems
  72 from cvs2svn_lib.key_generator import KeyGenerator
  73 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  74 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  75 from cvs2svn_lib.metadata_database import MetadataDatabase
  76 from cvs2svn_lib.metadata_database import MetadataLogger
  77 from cvs2svn_lib.repository_walker import walk_repository
  78
  79 import cvs2svn_rcsparse
  80
  81
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  A number is accepted
# if it consists of two or more dot-separated groups of decimal
# digits, for example '1.2' or '1.7.2'.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching branch numbers, either with the extra
# '0' component before the last digit group (e.g. '1.7.0.2') or
# without it (e.g. '1.7.2').  Group 1 is the leading part including
# its trailing dot and group 2 is the last digit group, so
# substituting r'\1\2' (as _SymbolDataCollector.define_symbol() does)
# drops the extra '0' component.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
  98
  99
 100 def rev_tuple(rev):
 101   """Return a tuple of integers corresponding to revision number REV.
 102
 103   For example, if REV is '1.2.3.4', then return (1,2,3,4)."""
 104
 105   return tuple([int(x) for x in rev.split('.')])
 106
 107
 108 def is_trunk_revision(rev):
 109   """Return True iff REV is a trunk revision.
 110
 111   REV is a revision number corresponding to a specific revision (i.e.,
 112   not a whole branch)."""
 113
 114   return rev.count('.') == 1
 115
 116
 117 def is_branch_revision_number(rev):
 118   """Return True iff REV is a branch revision number.
 119
 120   REV is a CVS revision number in canonical form (i.e., with zeros
 121   removed).  Return True iff it refers to a whole branch, as opposed
 122   to a single revision."""
 123
 124   return rev.count('.') % 2 == 0
 125
 126
 127 def is_same_line_of_development(rev1, rev2):
 128   """Return True if rev1 and rev2 are on the same line of
 129   development (i.e., both on trunk, or both on the same branch);
 130   return False otherwise.  Either rev1 or rev2 can be None, in
 131   which case automatically return False."""
 132
 133   if rev1 is None or rev2 is None:
 134     return False
 135   if rev1.count('.') == 1 and rev2.count('.') == 1:
 136     return True
 137   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 138     return True
 139   return False
 140
 141
 142 class _RevisionData:
 143   """We track the state of each revision so that in set_revision_info,
 144   we can determine if our op is an add/change/delete.  We can do this
 145   because in set_revision_info, we'll have all of the _RevisionData
 146   for a file at our fingertips, and we need to examine the state of
 147   our prev_rev to determine if we're an add or a change.  Without the
 148   state of the prev_rev, we are unable to distinguish between an add
 149   and a change."""
 150
 151   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 152     # The id of this revision:
 153     self.cvs_rev_id = cvs_rev_id
 154     self.rev = rev
 155     self.timestamp = timestamp
 156     self.author = author
 157     self.original_timestamp = timestamp
 158     self.state = state
 159
 160     # If this is the first revision on a branch, then this is the
 161     # branch_data of that branch; otherwise it is None.
 162     self.parent_branch_data = None
 163
 164     # The revision number of the parent of this revision along the
 165     # same line of development, if any.  For the first revision R on a
 166     # branch, we consider the revision from which R sprouted to be the
 167     # 'parent'.  If this is the root revision in the file's revision
 168     # tree, then this field is None.
 169     #
 170     # Note that this revision can't be determined arithmetically (due
 171     # to cvsadmin -o), which is why this field is necessary.
 172     self.parent = None
 173
 174     # The revision number of the primary child of this revision (the
 175     # child along the same line of development), if any; otherwise,
 176     # None.
 177     self.child = None
 178
 179     # The _BranchData instances of branches that sprout from this
 180     # revision, sorted in ascending order by branch number.  It would
 181     # be inconvenient to initialize it here because we would have to
 182     # scan through all branches known by the _SymbolDataCollector to
 183     # find the ones having us as the parent.  Instead, this
 184     # information is filled in by
 185     # _FileDataCollector._resolve_dependencies() and sorted by
 186     # _FileDataCollector._sort_branches().
 187     self.branches_data = []
 188
 189     # The revision numbers of the first commits on any branches on
 190     # which commits occurred.  This dependency is kept explicitly
 191     # because otherwise a revision-only topological sort would miss
 192     # the dependency that exists via branches_data.
 193     self.branches_revs_data = []
 194
 195     # The _TagData instances of tags that are connected to this
 196     # revision.
 197     self.tags_data = []
 198
 199     # A token that may be returned from
 200     # RevisionRecorder.record_text().  It can be used by
 201     # RevisionReader to obtain the text again.
 202     self.revision_recorder_token = None
 203
 204   def get_first_on_branch_id(self):
 205     return self.parent_branch_data and self.parent_branch_data.id
 206
 207
 208 class _SymbolData:
 209   """Collection area for information about a symbol in a single CVSFile.
 210
 211   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 212   Tag regardless of whether self is a _BranchData or a _TagData."""
 213
 214   def __init__(self, id, symbol):
 215     """Initialize an object for SYMBOL."""
 216
 217     # The unique id that will be used for this particular symbol in
 218     # this particular file.  This same id will be used for the CVSItem
 219     # that is derived from this instance.
 220     self.id = id
 221
 222     # An instance of Symbol.
 223     self.symbol = symbol
 224
 225
 226 class _BranchData(_SymbolData):
 227   """Collection area for information about a Branch in a single CVSFile."""
 228
 229   def __init__(self, id, symbol, branch_number):
 230     _SymbolData.__init__(self, id, symbol)
 231
 232     # The branch number (e.g., '1.5.2') of this branch.
 233     self.branch_number = branch_number
 234
 235     # The revision number of the revision from which this branch
 236     # sprouts (e.g., '1.5').
 237     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 238
 239     # The revision number of the first commit on this branch, if any
 240     # (e.g., '1.5.2.1'); otherwise, None.
 241     self.child = None
 242
 243
 244 class _TagData(_SymbolData):
 245   """Collection area for information about a Tag in a single CVSFile."""
 246
 247   def __init__(self, id, symbol, rev):
 248     _SymbolData.__init__(self, id, symbol)
 249
 250     # The revision number being tagged (e.g., '1.5.2.3').
 251     self.rev = rev
 252
 253
 254 class _SymbolDataCollector(object):
 255   """Collect information about symbols in a single CVSFile."""
 256
 257   def __init__(self, fdc, cvs_file):
 258     self.fdc = fdc
 259     self.cvs_file = cvs_file
 260
 261     self.pdc = self.fdc.pdc
 262     self.collect_data = self.fdc.collect_data
 263
 264     # A list [(name, revision), ...] of symbols defined in the header
 265     # of the file.  The name has already been transformed using the
 266     # symbol transform rules.  If the symbol transform rules indicate
 267     # that the symbol should be ignored, then it is never added to
 268     # this list.  This list is processed then deleted in
 269     # process_symbols().
 270     self._symbol_defs = []
 271
 272     # A set containing the transformed names of symbols in this file
 273     # (used to detect duplicates during processing of unlabeled
 274     # branches):
 275     self._defined_symbols = set()
 276
 277     # Map { branch_number : _BranchData }, where branch_number has an
 278     # odd number of digits.
 279     self.branches_data = { }
 280
 281     # Map { revision : [ tag_data ] }, where revision has an even
 282     # number of digits, and the value is a list of _TagData objects
 283     # for tags that apply to that revision.
 284     self.tags_data = { }
 285
 286   def _add_branch(self, name, branch_number):
 287     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 288     and derive and record the revision from which NAME sprouts.
 289     BRANCH_NUMBER is an RCS branch number with an odd number of
 290     components, for example '1.7.2' (never '1.7.0.2').  Return the
 291     _BranchData instance (which is usually newly-created)."""
 292
 293     branch_data = self.branches_data.get(branch_number)
 294
 295     if branch_data is not None:
 296       Log().warn(
 297           "%s: in '%s':\n"
 298           "   branch '%s' already has name '%s',\n"
 299           "   cannot also have name '%s', ignoring the latter\n"
 300           % (warning_prefix,
 301              self.cvs_file.filename, branch_number,
 302              branch_data.symbol.name, name)
 303           )
 304       return branch_data
 305
 306     symbol = self.pdc.get_symbol(name)
 307     branch_data = _BranchData(
 308         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 309         )
 310     self.branches_data[branch_number] = branch_data
 311     return branch_data
 312
 313   def _construct_distinct_name(self, name, original_name):
 314     """Construct a distinct symbol name from NAME.
 315
 316     If NAME is distinct, return it.  If it is already used in this
 317     file (as determined from its presence in self._defined_symbols),
 318     construct and return a new name that is not already used."""
 319
 320     if name not in self._defined_symbols:
 321       return name
 322     else:
 323       index = 1
 324       while True:
 325         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 326         if dup_name not in self._defined_symbols:
 327           self.collect_data.record_fatal_error(
 328               "Symbol name '%s' is already used in '%s'.\n"
 329               "The unlabeled branch '%s' must be renamed using "
 330               "--symbol-transform."
 331               % (name, self.cvs_file.filename, original_name,)
 332               )
 333           return dup_name
 334
 335   def _add_unlabeled_branch(self, branch_number):
 336     original_name = "unlabeled-" + branch_number
 337     name = self.transform_symbol(original_name, branch_number)
 338     if name is None:
 339       self.collect_data.record_fatal_error(
 340           "The unlabeled branch '%s' in '%s' contains commits.\n"
 341           "It may not be ignored via a symbol transform.  (Use --exclude "
 342           "instead.)"
 343           % (original_name, self.cvs_file.filename,)
 344           )
 345       # Retain the original name to allow the conversion to continue:
 346       name = original_name
 347
 348     distinct_name = self._construct_distinct_name(name, original_name)
 349     self._defined_symbols.add(distinct_name)
 350     return self._add_branch(distinct_name, branch_number)
 351
 352   def _add_tag(self, name, revision):
 353     """Record that tag NAME refers to the specified REVISION."""
 354
 355     symbol = self.pdc.get_symbol(name)
 356     tag_data = _TagData(
 357         self.collect_data.item_key_generator.gen_id(), symbol, revision
 358         )
 359     self.tags_data.setdefault(revision, []).append(tag_data)
 360     return tag_data
 361
 362   def transform_symbol(self, name, revision):
 363     """Transform a symbol according to the project's symbol transforms.
 364
 365     Transform the symbol with the original name NAME and canonicalized
 366     revision number REVISION.  Return the new symbol name or None if
 367     the symbol should be ignored entirely.
 368
 369     Log the results of the symbol transform if necessary."""
 370
 371     old_name = name
 372     # Apply any user-defined symbol transforms to the symbol name:
 373     name = self.cvs_file.project.transform_symbol(
 374         self.cvs_file, name, revision
 375         )
 376
 377     if name is None:
 378       # Ignore symbol:
 379       self.pdc.log_symbol_transform(old_name, None)
 380       Log().verbose(
 381           "   symbol '%s'=%s ignored in %s"
 382           % (old_name, revision, self.cvs_file.filename,)
 383           )
 384     else:
 385       if name != old_name:
 386         self.pdc.log_symbol_transform(old_name, name)
 387         Log().verbose(
 388             "   symbol '%s'=%s transformed to '%s' in %s"
 389             % (old_name, revision, name, self.cvs_file.filename,)
 390             )
 391
 392     return name
 393
 394   def define_symbol(self, name, revision):
 395     """Record a symbol definition for later processing."""
 396
 397     # Canonicalize the revision number:
 398     revision = _branch_revision_re.sub(r'\1\2', revision)
 399
 400     # Apply any user-defined symbol transforms to the symbol name:
 401     name = self.transform_symbol(name, revision)
 402
 403     if name is not None:
 404       # Verify that the revision number is valid:
 405       if _valid_revision_re.match(revision):
 406         # The revision number is valid; record it for later processing:
 407         self._symbol_defs.append( (name, revision) )
 408       else:
 409         Log().warn(
 410             'In %r:\n'
 411             '    branch %r references invalid revision %s\n'
 412             '    and will be ignored.'
 413             % (self.cvs_file.filename, name, revision,)
 414             )
 415
 416   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 417     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 418
 419     Duplicate definitions of symbol names have been seen in the wild,
 420     and they can also happen when --symbol-transform is used.  If a
 421     symbol is defined to the same revision number repeatedly, then
 422     ignore all but the last definition."""
 423
 424     # Make a copy, since we have to iterate through the definitions
 425     # twice:
 426     symbol_defs = list(symbol_defs)
 427
 428     # A map { (name, revision) : [index,...] } of the indexes where
 429     # symbol definitions name=revision were found:
 430     known_definitions = {}
 431     for (i, symbol_def) in enumerate(symbol_defs):
 432       known_definitions.setdefault(symbol_def, []).append(i)
 433
 434     # A set of the indexes of entries that have to be removed from
 435     # symbol_defs:
 436     dup_indexes = set()
 437     for ((name, revision), indexes) in known_definitions.iteritems():
 438       if len(indexes) > 1:
 439         Log().verbose(
 440             "in %r:\n"
 441             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 442             % (self.cvs_file.filename, name, revision,)
 443             )
 444         dup_indexes.update(indexes[:-1])
 445
 446     for (i, symbol_def) in enumerate(symbol_defs):
 447       if i not in dup_indexes:
 448         yield symbol_def
 449
 450   def _process_duplicate_defs(self, symbol_defs):
 451     """Iterate through SYMBOL_DEFS, processing duplicate names.
 452
 453     Duplicate definitions of symbol names have been seen in the wild,
 454     and they can also happen when --symbol-transform is used.  If a
 455     symbol is defined multiple times, then it is a fatal error.  This
 456     method should be called after _eliminate_trivial_duplicate_defs()."""
 457
 458     # Make a copy, since we have to access multiple times:
 459     symbol_defs = list(symbol_defs)
 460
 461     # A map {name : [index,...]} mapping the names of symbols to a
 462     # list of their definitions' indexes in symbol_defs:
 463     known_symbols = {}
 464     for (i, (name, revision)) in enumerate(symbol_defs):
 465       known_symbols.setdefault(name, []).append(i)
 466
 467     known_symbols = known_symbols.items()
 468     known_symbols.sort()
 469     dup_indexes = set()
 470     for (name, indexes) in known_symbols:
 471       if len(indexes) > 1:
 472         # This symbol was defined multiple times.
 473         self.collect_data.record_fatal_error(
 474             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 475                 name, self.cvs_file.filename,
 476                 ' '.join([symbol_defs[i][1] for i in indexes]),
 477                 )
 478             )
 479         # Ignore all but the last definition for now, to allow the
 480         # conversion to proceed:
 481         dup_indexes.update(indexes[:-1])
 482
 483     for (i, symbol_def) in enumerate(symbol_defs):
 484       if i not in dup_indexes:
 485         yield symbol_def
 486
 487   def _process_symbol(self, name, revision):
 488     """Process a symbol called NAME, which is associated with REVISON.
 489
 490     REVISION is a canonical revision number with zeros removed, for
 491     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 492     transformed branch or tag name."""
 493
 494     # Add symbol to our records:
 495     if is_branch_revision_number(revision):
 496       self._add_branch(name, revision)
 497     else:
 498       self._add_tag(name, revision)
 499
 500   def process_symbols(self):
 501     """Process the symbol definitions from SELF._symbol_defs."""
 502
 503     symbol_defs = self._symbol_defs
 504     del self._symbol_defs
 505
 506     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 507     symbol_defs = self._process_duplicate_defs(symbol_defs)
 508
 509     for (name, revision) in symbol_defs:
 510       self._defined_symbols.add(name)
 511       self._process_symbol(name, revision)
 512
 513   @staticmethod
 514   def rev_to_branch_number(revision):
 515     """Return the branch_number of the branch on which REVISION lies.
 516
 517     REVISION is a branch revision number with an even number of
 518     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 519     The return value is the branch number (for example, '1.7.2').
 520     Return none iff REVISION is a trunk revision such as '1.2'."""
 521
 522     if is_trunk_revision(revision):
 523       return None
 524     return revision[:revision.rindex(".")]
 525
 526   def rev_to_branch_data(self, revision):
 527     """Return the branch_data of the branch on which REVISION lies.
 528
 529     REVISION must be a branch revision number with an even number of
 530     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 531     Raise KeyError iff REVISION is unknown."""
 532
 533     assert not is_trunk_revision(revision)
 534
 535     return self.branches_data[self.rev_to_branch_number(revision)]
 536
 537   def rev_to_lod(self, revision):
 538     """Return the line of development on which REVISION lies.
 539
 540     REVISION must be a revision number with an even number of
 541     components.  Raise KeyError iff REVISION is unknown."""
 542
 543     if is_trunk_revision(revision):
 544       return self.pdc.trunk
 545     else:
 546       return self.rev_to_branch_data(revision).symbol
 547
 548
 549 class _FileDataCollector(cvs2svn_rcsparse.Sink):
 550   """Class responsible for collecting RCS data for a particular file.
 551
 552   Any collected data that need to be remembered are stored into the
 553   referenced CollectData instance."""
 554
 555   def __init__(self, pdc, cvs_file):
 556     """Create an object that is prepared to receive data for CVS_FILE.
 557     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 558     information collected about the file."""
 559
 560     self.pdc = pdc
 561     self.cvs_file = cvs_file
 562
 563     self.collect_data = self.pdc.collect_data
 564     self.project = self.cvs_file.project
 565
 566     # A place to store information about the symbols in this file:
 567     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 568
 569     # { revision : _RevisionData instance }
 570     self._rev_data = { }
 571
 572     # Lists [ (parent, child) ] of revision number pairs indicating
 573     # that revision child depends on revision parent along the main
 574     # line of development.
 575     self._primary_dependencies = []
 576
 577     # If set, this is an RCS branch number -- rcsparse calls this the
 578     # "principal branch", but CVS and RCS refer to it as the "default
 579     # branch", so that's what we call it, even though the rcsparse API
 580     # setter method is still 'set_principal_branch'.
 581     self.default_branch = None
 582
 583     # True iff revision 1.1 of the file appears to have been imported
 584     # (as opposed to added normally).
 585     self._file_imported = False
 586
 587   def _get_rev_id(self, revision):
 588     if revision is None:
 589       return None
 590     return self._rev_data[revision].cvs_rev_id
 591
 592   def set_principal_branch(self, branch):
 593     """This is a callback method declared in Sink."""
 594
 595     if branch.find('.') == -1:
 596       # This just sets the default branch to trunk.  Normally this
 597       # shouldn't occur, but it has been seen in at least one CVS
 598       # repository.  Just ignore it.
 599       pass
 600     else:
 601       self.default_branch = branch
 602
  def set_expansion(self, mode):
    """Store MODE in the mode attribute of this file's CVSFile.

    This is a callback method declared in Sink."""

    self.cvs_file.mode = mode
 607
  def set_description(self, description):
    """Store DESCRIPTION in the description attribute of this file's CVSFile.

    This is a callback method declared in Sink."""

    self.cvs_file.description = description
 612
  def define_tag(self, name, revision):
    """Remember the symbol name and revision, but don't process them yet.

    The definition is handed to the _SymbolDataCollector, which
    processes all of the definitions together when admin_completed()
    is called.

    This is a callback method declared in Sink."""

    self.sdc.define_symbol(name, revision)
 619
  def admin_completed(self):
    """Process the symbol definitions remembered by define_tag().

    This is a callback method declared in Sink."""

    self.sdc.process_symbols()
 624
 625   def define_revision(self, revision, timestamp, author, state,
 626                       branches, next):
 627     """This is a callback method declared in Sink."""
 628
 629     for branch in branches:
 630       try:
 631         branch_data = self.sdc.rev_to_branch_data(branch)
 632       except KeyError:
 633         # Normally we learn about the branches from the branch names
 634         # and numbers parsed from the symbolic name header.  But this
 635         # must have been an unlabeled branch that slipped through the
 636         # net.  Generate a name for it and create a _BranchData record
 637         # for it now.
 638         branch_data = self.sdc._add_unlabeled_branch(
 639             self.sdc.rev_to_branch_number(branch))
 640
 641       assert branch_data.child is None
 642       branch_data.child = branch
 643
 644     if revision in self._rev_data:
 645       # This revision has already been seen.
 646       Log().error('File %r contains duplicate definitions of revision %s.'
 647                   % (self.cvs_file.filename, revision,))
 648       raise RuntimeError
 649
 650     # Record basic information about the revision:
 651     rev_data = _RevisionData(
 652         self.collect_data.item_key_generator.gen_id(),
 653         revision, int(timestamp), author, state)
 654     self._rev_data[revision] = rev_data
 655
 656     # When on trunk, the RCS 'next' revision number points to what
 657     # humans might consider to be the 'previous' revision number.  For
 658     # example, 1.3's RCS 'next' is 1.2.
 659     #
 660     # However, on a branch, the RCS 'next' revision number really does
 661     # point to what humans would consider to be the 'next' revision
 662     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 663     #
 664     # In other words, in RCS, 'next' always means "where to find the next
 665     # deltatext that you need this revision to retrieve.
 666     #
 667     # That said, we don't *want* RCS's behavior here, so we determine
 668     # whether we're on trunk or a branch and set the dependencies
 669     # accordingly.
 670     if next:
 671       if is_trunk_revision(revision):
 672         self._primary_dependencies.append( (next, revision,) )
 673       else:
 674         self._primary_dependencies.append( (revision, next,) )
 675
 676   def _resolve_primary_dependencies(self):
 677     """Resolve the dependencies listed in self._primary_dependencies."""
 678
 679     for (parent, child,) in self._primary_dependencies:
 680       parent_data = self._rev_data[parent]
 681       assert parent_data.child is None
 682       parent_data.child = child
 683
 684       child_data = self._rev_data[child]
 685       assert child_data.parent is None
 686       child_data.parent = parent
 687
 688   def _resolve_branch_dependencies(self):
 689     """Resolve dependencies involving branches."""
 690
 691     for branch_data in self.sdc.branches_data.values():
 692       # The branch_data's parent has the branch as a child regardless
 693       # of whether the branch had any subsequent commits:
 694       try:
 695         parent_data = self._rev_data[branch_data.parent]
 696       except KeyError:
 697         Log().warn(
 698             'In %r:\n'
 699             '    branch %r references non-existing revision %s\n'
 700             '    and will be ignored.'
 701             % (self.cvs_file.filename, branch_data.symbol.name,
 702                branch_data.parent,))
 703         del self.sdc.branches_data[branch_data.branch_number]
 704       else:
 705         parent_data.branches_data.append(branch_data)
 706
 707         # If the branch has a child (i.e., something was committed on
 708         # the branch), then we store a reference to the branch_data
 709         # there, define the child's parent to be the branch's parent,
 710         # and list the child in the branch parent's branches_revs_data:
 711         if branch_data.child is not None:
 712           child_data = self._rev_data[branch_data.child]
 713           assert child_data.parent_branch_data is None
 714           child_data.parent_branch_data = branch_data
 715           assert child_data.parent is None
 716           child_data.parent = branch_data.parent
 717           parent_data.branches_revs_data.append(branch_data.child)
 718
 719   def _sort_branches(self):
 720     """Sort the branches sprouting from each revision in creation order.
 721
 722     Creation order is taken to be the reverse of the order that they
 723     are listed in the symbols part of the RCS file.  (If a branch is
 724     created then deleted, a later branch can be assigned the recycled
 725     branch number; therefore branch numbers are not an indication of
 726     creation order.)"""
 727
 728     for rev_data in self._rev_data.values():
 729       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 730
 731   def _resolve_tag_dependencies(self):
 732     """Resolve dependencies involving tags."""
 733
 734     for (rev, tag_data_list) in self.sdc.tags_data.items():
 735       try:
 736         parent_data = self._rev_data[rev]
 737       except KeyError:
 738         Log().warn(
 739             'In %r:\n'
 740             '    the following tag(s) reference non-existing revision %s\n'
 741             '    and will be ignored:\n'
 742             '    %s' % (
 743                 self.cvs_file.filename, rev,
 744                 ', '.join([repr(tag_data.symbol.name)
 745                            for tag_data in tag_data_list]),))
 746         del self.sdc.tags_data[rev]
 747       else:
 748         for tag_data in tag_data_list:
 749           assert tag_data.rev == rev
 750           # The tag_data's rev has the tag as a child:
 751           parent_data.tags_data.append(tag_data)
 752
 753   def _determine_operation(self, rev_data):
 754     prev_rev_data = self._rev_data.get(rev_data.parent)
 755     return cvs_revision_type_map[(
 756         rev_data.state != 'dead',
 757         prev_rev_data is not None and prev_rev_data.state != 'dead',
 758         )]
 759
 760   def _get_cvs_revision(self, rev_data):
 761     """Create and return a CVSRevision for REV_DATA."""
 762
 763     branch_ids = [
 764         branch_data.id
 765         for branch_data in rev_data.branches_data
 766         ]
 767
 768     branch_commit_ids = [
 769         self._get_rev_id(rev)
 770         for rev in rev_data.branches_revs_data
 771         ]
 772
 773     tag_ids = [
 774         tag_data.id
 775         for tag_data in rev_data.tags_data
 776         ]
 777
 778     revision_type = self._determine_operation(rev_data)
 779
 780     return revision_type(
 781         self._get_rev_id(rev_data.rev), self.cvs_file,
 782         rev_data.timestamp, None,
 783         self._get_rev_id(rev_data.parent),
 784         self._get_rev_id(rev_data.child),
 785         rev_data.rev,
 786         True,
 787         self.sdc.rev_to_lod(rev_data.rev),
 788         rev_data.get_first_on_branch_id(),
 789         False, None, None,
 790         tag_ids, branch_ids, branch_commit_ids,
 791         rev_data.revision_recorder_token)
 792
 793   def _get_cvs_revisions(self):
 794     """Generate the CVSRevisions present in this file."""
 795
 796     for rev_data in self._rev_data.itervalues():
 797       yield self._get_cvs_revision(rev_data)
 798
 799   def _get_cvs_branches(self):
 800     """Generate the CVSBranches present in this file."""
 801
 802     for branch_data in self.sdc.branches_data.values():
 803       yield CVSBranch(
 804           branch_data.id, self.cvs_file, branch_data.symbol,
 805           branch_data.branch_number,
 806           self.sdc.rev_to_lod(branch_data.parent),
 807           self._get_rev_id(branch_data.parent),
 808           self._get_rev_id(branch_data.child),
 809           None,
 810           )
 811
 812   def _get_cvs_tags(self):
 813     """Generate the CVSTags present in this file."""
 814
 815     for tags_data in self.sdc.tags_data.values():
 816       for tag_data in tags_data:
 817         yield CVSTag(
 818             tag_data.id, self.cvs_file, tag_data.symbol,
 819             self.sdc.rev_to_lod(tag_data.rev),
 820             self._get_rev_id(tag_data.rev),
 821             None,
 822             )
 823
 824   def tree_completed(self):
 825     """The revision tree has been parsed.
 826
 827     Analyze it for consistency and connect some loose ends.
 828
 829     This is a callback method declared in Sink."""
 830
 831     self._resolve_primary_dependencies()
 832     self._resolve_branch_dependencies()
 833     self._sort_branches()
 834     self._resolve_tag_dependencies()
 835
 836     # Compute the preliminary CVSFileItems for this file:
 837     cvs_items = []
 838     cvs_items.extend(self._get_cvs_revisions())
 839     cvs_items.extend(self._get_cvs_branches())
 840     cvs_items.extend(self._get_cvs_tags())
 841     self._cvs_file_items = CVSFileItems(
 842         self.cvs_file, self.pdc.trunk, cvs_items
 843         )
 844
 845     self._cvs_file_items.check_link_consistency()
 846
 847     # Tell the revision recorder about the file dependency tree.
 848     self.collect_data.revision_recorder.start_file(self._cvs_file_items)
 849
 850   def set_revision_info(self, revision, log, text):
 851     """This is a callback method declared in Sink."""
 852
 853     rev_data = self._rev_data[revision]
 854     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 855
 856     if cvs_rev.metadata_id is not None:
 857       # Users have reported problems with repositories in which the
 858       # deltatext block for revision 1.1 appears twice.  It is not
 859       # known whether this results from a CVS/RCS bug, or from botched
 860       # hand-editing of the repository.  In any case, empirically, cvs
 861       # and rcs both use the first version when checking out data, so
 862       # that's what we will do.  (For the record: "cvs log" fails on
 863       # such a file; "rlog" prints the log message from the first
 864       # block and ignores the second one.)
 865       Log().warn(
 866           "%s: in '%s':\n"
 867           "   Deltatext block for revision %s appeared twice;\n"
 868           "   ignoring the second occurrence.\n"
 869           % (warning_prefix, self.cvs_file.filename, revision,)
 870           )
 871       return
 872
 873     if is_trunk_revision(revision):
 874       branch_name = None
 875     else:
 876       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 877
 878     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 879         self.project, branch_name, rev_data.author, log
 880         )
 881     cvs_rev.deltatext_exists = bool(text)
 882
 883     # If this is revision 1.1, determine whether the file appears to
 884     # have been created via 'cvs add' instead of 'cvs import'.  The
 885     # test is that the log message CVS uses for 1.1 in imports is
 886     # "Initial revision\n" with no period.  (This fact helps determine
 887     # whether this file might have had a default branch in the past.)
 888     if revision == '1.1':
 889       self._file_imported = (log == 'Initial revision\n')
 890
 891     cvs_rev.revision_recorder_token = \
 892         self.collect_data.revision_recorder.record_text(cvs_rev, log, text)
 893
 894   def parse_completed(self):
 895     """Finish the processing of this file.
 896
 897     This is a callback method declared in Sink."""
 898
 899     # Make sure that there was an info section for each revision:
 900     for cvs_item in self._cvs_file_items.values():
 901       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 902         self.collect_data.record_fatal_error(
 903             '%r has no deltatext section for revision %s'
 904             % (self.cvs_file.filename, cvs_item.rev,)
 905             )
 906
 907   def _process_ntdbrs(self):
 908     """Fix up any non-trunk default branch revisions (if present).
 909
 910     If a non-trunk default branch is determined to have existed, yield
 911     the _RevisionData.ids for all revisions that were once non-trunk
 912     default revisions, in dependency order.
 913
 914     There are two cases to handle:
 915
 916     One case is simple.  The RCS file lists a default branch
 917     explicitly in its header, such as '1.1.1'.  In this case, we know
 918     that every revision on the vendor branch is to be treated as head
 919     of trunk at that point in time.
 920
 921     But there's also a degenerate case.  The RCS file does not
 922     currently have a default branch, yet we can deduce that for some
 923     period in the past it probably *did* have one.  For example, the
 924     file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
 925     dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
 926     after 1.2.  In this case, we should record 1.1.1.96 as the last
 927     vendor revision to have been the head of the default branch.
 928
 929     If any non-trunk default branch revisions are found:
 930
 931     - Set their ntdbr members to True.
 932
 933     - Connect the last one with revision 1.2.
 934
 935     - Remove revision 1.1 if it is not needed.
 936
 937     """
 938
 939     try:
 940       if self.default_branch:
 941         vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
 942         vendor_lod_items = self._cvs_file_items.get_lod_items(
 943             self._cvs_file_items[vendor_cvs_branch_id]
 944             )
 945         if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
 946           return
 947       elif self._file_imported:
 948         vendor_branch_data = self.sdc.branches_data.get('1.1.1')
 949         if vendor_branch_data is None:
 950           return
 951         else:
 952           vendor_lod_items = self._cvs_file_items.get_lod_items(
 953               self._cvs_file_items[vendor_branch_data.id]
 954               )
 955           if not self._cvs_file_items.process_historical_ntdb(
 956                 vendor_lod_items
 957                 ):
 958             return
 959       else:
 960         return
 961     except VendorBranchError, e:
 962       self.collect_data.record_fatal_error(str(e))
 963       return
 964
 965     if self._file_imported:
 966       self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
 967
 968     self._cvs_file_items.check_link_consistency()
 969
 970   def get_cvs_file_items(self):
 971     """Finish up and return a CVSFileItems instance for this file.
 972
 973     This method must only be called once."""
 974
 975     self._process_ntdbrs()
 976
 977     # Break a circular reference loop, allowing the memory for self
 978     # and sdc to be freed.
 979     del self.sdc
 980
 981     return self._cvs_file_items
 982
 983
 984 class _ProjectDataCollector:
 985   def __init__(self, collect_data, project):
 986     self.collect_data = collect_data
 987     self.project = project
 988     self.num_files = 0
 989
 990     # The Trunk LineOfDevelopment object for this project:
 991     self.trunk = Trunk(
 992         self.collect_data.symbol_key_generator.gen_id(), self.project
 993         )
 994     self.project.trunk_id = self.trunk.id
 995
 996     # This causes a record for self.trunk to spring into existence:
 997     self.collect_data.register_trunk(self.trunk)
 998
 999     # A map { name -> Symbol } for all known symbols in this project.
1000     # The symbols listed here are undifferentiated into Branches and
1001     # Tags because the same name might appear as a branch in one file
1002     # and a tag in another.
1003     self.symbols = {}
1004
1005     # A map { (old_name, new_name) : count } indicating how many files
1006     # were affected by each each symbol name transformation:
1007     self.symbol_transform_counts = {}
1008
1009   def get_symbol(self, name):
1010     """Return the Symbol object for the symbol named NAME in this project.
1011
1012     If such a symbol does not yet exist, allocate a new symbol_id,
1013     create a Symbol instance, store it in self.symbols, and return it."""
1014
1015     symbol = self.symbols.get(name)
1016     if symbol is None:
1017       symbol = Symbol(
1018           self.collect_data.symbol_key_generator.gen_id(),
1019           self.project, name)
1020       self.symbols[name] = symbol
1021     return symbol
1022
1023   def log_symbol_transform(self, old_name, new_name):
1024     """Record that OLD_NAME was transformed to NEW_NAME in one file.
1025
1026     This information is used to generated a statistical summary of
1027     symbol transforms."""
1028
1029     try:
1030       self.symbol_transform_counts[old_name, new_name] += 1
1031     except KeyError:
1032       self.symbol_transform_counts[old_name, new_name] = 1
1033
1034   def summarize_symbol_transforms(self):
1035     if self.symbol_transform_counts and Log().is_on(Log.NORMAL):
1036       log = Log()
1037       log.normal('Summary of symbol transforms:')
1038       transforms = self.symbol_transform_counts.items()
1039       transforms.sort()
1040       for ((old_name, new_name), count) in transforms:
1041         if new_name is None:
1042           log.normal('    "%s" ignored in %d files' % (old_name, count,))
1043         else:
1044           log.normal(
1045               '    "%s" transformed to "%s" in %d files'
1046               % (old_name, new_name, count,)
1047               )
1048
1049   def process_file(self, cvs_file):
1050     Log().normal(cvs_file.filename)
1051     fdc = _FileDataCollector(self, cvs_file)
1052     try:
1053       cvs2svn_rcsparse.parse(open(cvs_file.filename, 'rb'), fdc)
1054     except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
1055       self.collect_data.record_fatal_error(
1056           "%r is not a valid ,v file" % (cvs_file.filename,)
1057           )
1058       # Abort the processing of this file, but let the pass continue
1059       # with other files:
1060       return
1061     except:
1062       Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
1063       raise
1064     else:
1065       self.num_files += 1
1066
1067     return fdc.get_cvs_file_items()
1068
1069
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, revision_recorder, stats_keeper):
    """Create the stores and key generators, then start REVISION_RECORDER.

    REVISION_RECORDER is told about the start and the end of each file
    and of the whole collection; STATS_KEEPER is told about each
    CVSFile and CVSItem that is recorded."""

    self.revision_recorder = revision_recorder
    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # The fatal errors recorded so far (a list of strings):
    self.fatal_errors = []

    # The number of files processed so far, over all projects:
    self.num_files = 0
    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSFiles:
    self.file_key_generator = KeyGenerator()

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

    self.revision_recorder.start()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    Log().error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # This causes a record to spring into existence:
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile.

    CVS_FILE_ITEMS must be a CVSFileItems instance (not None).  It is
    cleaned up, checked for consistency, passed to the revision
    recorder, stored, and registered with the symbol statistics."""

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.revision_recorder.finish_file(cvs_file_items)
    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def process_project(self, project):
    """Collect the data for all of the paths found in PROJECT.

    Directories are recorded directly; files are parsed and their
    items processed.  A file that cannot be parsed is skipped (the
    fatal error was already recorded when parsing failed), so that the
    remaining files are still processed.  If the project contains no
    RCS file at all, a fatal error is recorded."""

    Ctx()._projects[project.id] = project

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in walk_repository(
          project, self.file_key_generator, self.record_fatal_error
          ):
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
      else:
        # An RCS file was found, even if it should turn out to be
        # invalid:
        found_rcs_file = True
        cvs_file_items = pdc.process_file(cvs_path)
        if cvs_file_items is None:
          # The file could not be parsed; process_file() returns None
          # in that case after recording a fatal error.  Continue with
          # the other files instead of failing on the None value.
          continue
        self._process_cvs_file_items(cvs_file_items)

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    Log().verbose('Processed', self.num_files, 'files')

  def _set_cvs_path_ordinals(self):
    """Set the ordinal attribute of every path in Ctx()._cvs_path_db.

    The paths are sorted using CVSPath.slow_compare, and each path's
    ordinal is set to its index in the sorted sequence."""

    cvs_files = list(Ctx()._cvs_path_db.itervalues())
    cvs_files.sort(CVSPath.slow_compare)
    for (i, cvs_file) in enumerate(cvs_files):
      cvs_file.ordinal = i

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.revision_recorder.finish()
    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._set_cvs_path_ordinals()
    self.revision_recorder = None
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
1215
1216