cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents.
  22
  23 As a *,v file is parsed, the information pertaining to the file is
  24 accumulated in memory, mostly in _RevisionData, _BranchData, and
  25 _TagData objects.  When parsing is complete, a final pass is made over
  26 the data to create some final dependency links, collect statistics,
  27 etc., then the _*Data objects are converted into CVSItem objects
  28 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  29 dumped into databases.
  30
  31 During the data collection, persistent unique ids are allocated to
  32 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  33 special case.  CVSItem ids are unique across all CVSItem types, and
  34 the ids are carried over from the corresponding data collection
  35 objects:
  36
  37     _RevisionData -> CVSRevision
  38
  39     _BranchData -> CVSBranch
  40
  41     _TagData -> CVSTag
  42
  43 In a later pass it is possible to convert tags <-> branches.  But even
  44 if this occurs, the new branch or tag uses the same id as the old tag
  45 or branch.
  46
  47 """
  48
  49
  50 import re
  51
  52 from cvs2svn_lib import config
  53 from cvs2svn_lib.common import DB_OPEN_NEW
  54 from cvs2svn_lib.common import warning_prefix
  55 from cvs2svn_lib.common import error_prefix
  56 from cvs2svn_lib.common import is_trunk_revision
  57 from cvs2svn_lib.common import is_branch_revision_number
  58 from cvs2svn_lib.log import logger
  59 from cvs2svn_lib.context import Ctx
  60 from cvs2svn_lib.artifact_manager import artifact_manager
  61 from cvs2svn_lib.cvs_path import CVSFile
  62 from cvs2svn_lib.cvs_path import CVSDirectory
  63 from cvs2svn_lib.symbol import Symbol
  64 from cvs2svn_lib.symbol import Trunk
  65 from cvs2svn_lib.cvs_item import CVSRevision
  66 from cvs2svn_lib.cvs_item import CVSBranch
  67 from cvs2svn_lib.cvs_item import CVSTag
  68 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  69 from cvs2svn_lib.cvs_file_items import VendorBranchError
  70 from cvs2svn_lib.cvs_file_items import CVSFileItems
  71 from cvs2svn_lib.key_generator import KeyGenerator
  72 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  73 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  74 from cvs2svn_lib.metadata_database import MetadataDatabase
  75 from cvs2svn_lib.metadata_database import MetadataLogger
  76
  77 import cvs2svn_rcsparse
  78
  79
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  It accepts two or
# more groups of digits separated by dots, for example '1.7' or
# '1.7.2'.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching branch numbers, either in the plain
# form (e.g., '1.7.2') or in the form with an extra zero component
# before the last digit group (e.g., '1.7.0.2').  Group 1 is the
# leading part including its trailing dot (e.g., '1.7.') and group 2
# is the last digit group, so group 1 + group 2 is the branch number
# with the extra zero removed.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
  96
  97
  98 def is_same_line_of_development(rev1, rev2):
  99   """Return True if rev1 and rev2 are on the same line of
 100   development (i.e., both on trunk, or both on the same branch);
 101   return False otherwise.  Either rev1 or rev2 can be None, in
 102   which case automatically return False."""
 103
 104   if rev1 is None or rev2 is None:
 105     return False
 106   if is_trunk_revision(rev1) and is_trunk_revision(rev2):
 107     # Trunk revisions have to be handled specially because the main
 108     # trunk version number can be changed; e.g., from 1 to 2.
 109     return True
 110   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 111     return True
 112   return False
 113
 114
 115 class _RevisionData:
 116   """We track the state of each revision so that in set_revision_info,
 117   we can determine if our op is an add/change/delete.  We can do this
 118   because in set_revision_info, we'll have all of the _RevisionData
 119   for a file at our fingertips, and we need to examine the state of
 120   our prev_rev to determine if we're an add or a change.  Without the
 121   state of the prev_rev, we are unable to distinguish between an add
 122   and a change."""
 123
 124   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 125     # The id of this revision:
 126     self.cvs_rev_id = cvs_rev_id
 127     self.rev = rev
 128     self.timestamp = timestamp
 129     self.author = author
 130     self.state = state
 131
 132     # If this is the first revision on a branch, then this is the
 133     # branch_data of that branch; otherwise it is None.
 134     self.parent_branch_data = None
 135
 136     # The revision number of the parent of this revision along the
 137     # same line of development, if any.  For the first revision R on a
 138     # branch, we consider the revision from which R sprouted to be the
 139     # 'parent'.  If this is the root revision in the file's revision
 140     # tree, then this field is None.
 141     #
 142     # Note that this revision can't be determined arithmetically (due
 143     # to cvsadmin -o), which is why this field is necessary.
 144     self.parent = None
 145
 146     # The revision number of the primary child of this revision (the
 147     # child along the same line of development), if any; otherwise,
 148     # None.
 149     self.child = None
 150
 151     # The _BranchData instances of branches that sprout from this
 152     # revision, sorted in ascending order by branch number.  It would
 153     # be inconvenient to initialize it here because we would have to
 154     # scan through all branches known by the _SymbolDataCollector to
 155     # find the ones having us as the parent.  Instead, this
 156     # information is filled in by
 157     # _FileDataCollector._resolve_dependencies() and sorted by
 158     # _FileDataCollector._sort_branches().
 159     self.branches_data = []
 160
 161     # The revision numbers of the first commits on any branches on
 162     # which commits occurred.  This dependency is kept explicitly
 163     # because otherwise a revision-only topological sort would miss
 164     # the dependency that exists via branches_data.
 165     self.branches_revs_data = []
 166
 167     # The _TagData instances of tags that are connected to this
 168     # revision.
 169     self.tags_data = []
 170
 171     # A token that may be set by a RevisionCollector, then used by
 172     # RevisionReader to obtain the text again.
 173     self.revision_reader_token = None
 174
 175   def get_first_on_branch_id(self):
 176     return self.parent_branch_data and self.parent_branch_data.id
 177
 178
 179 class _SymbolData:
 180   """Collection area for information about a symbol in a single CVSFile.
 181
 182   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 183   Tag regardless of whether self is a _BranchData or a _TagData."""
 184
 185   def __init__(self, id, symbol):
 186     """Initialize an object for SYMBOL."""
 187
 188     # The unique id that will be used for this particular symbol in
 189     # this particular file.  This same id will be used for the CVSItem
 190     # that is derived from this instance.
 191     self.id = id
 192
 193     # An instance of Symbol.
 194     self.symbol = symbol
 195
 196
 197 class _BranchData(_SymbolData):
 198   """Collection area for information about a Branch in a single CVSFile."""
 199
 200   def __init__(self, id, symbol, branch_number):
 201     _SymbolData.__init__(self, id, symbol)
 202
 203     # The branch number (e.g., '1.5.2') of this branch.
 204     self.branch_number = branch_number
 205
 206     # The revision number of the revision from which this branch
 207     # sprouts (e.g., '1.5').
 208     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 209
 210     # The revision number of the first commit on this branch, if any
 211     # (e.g., '1.5.2.1'); otherwise, None.
 212     self.child = None
 213
 214
 215 class _TagData(_SymbolData):
 216   """Collection area for information about a Tag in a single CVSFile."""
 217
 218   def __init__(self, id, symbol, rev):
 219     _SymbolData.__init__(self, id, symbol)
 220
 221     # The revision number being tagged (e.g., '1.5.2.3').
 222     self.rev = rev
 223
 224
 225 class _SymbolDataCollector(object):
 226   """Collect information about symbols in a single CVSFile."""
 227
 228   def __init__(self, fdc, cvs_file):
 229     self.fdc = fdc
 230     self.cvs_file = cvs_file
 231
 232     self.pdc = self.fdc.pdc
 233     self.collect_data = self.fdc.collect_data
 234
 235     # A list [(name, revision), ...] of symbols defined in the header
 236     # of the file.  The name has already been transformed using the
 237     # symbol transform rules.  If the symbol transform rules indicate
 238     # that the symbol should be ignored, then it is never added to
 239     # this list.  This list is processed then deleted in
 240     # process_symbols().
 241     self._symbol_defs = []
 242
 243     # A set containing the transformed names of symbols in this file
 244     # (used to detect duplicates during processing of unlabeled
 245     # branches):
 246     self._defined_symbols = set()
 247
 248     # Map { branch_number : _BranchData }, where branch_number has an
 249     # odd number of digits.
 250     self.branches_data = { }
 251
 252     # Map { revision : [ tag_data ] }, where revision has an even
 253     # number of digits, and the value is a list of _TagData objects
 254     # for tags that apply to that revision.
 255     self.tags_data = { }
 256
 257   def _add_branch(self, name, branch_number):
 258     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 259     and derive and record the revision from which NAME sprouts.
 260     BRANCH_NUMBER is an RCS branch number with an odd number of
 261     components, for example '1.7.2' (never '1.7.0.2').  Return the
 262     _BranchData instance (which is usually newly-created)."""
 263
 264     branch_data = self.branches_data.get(branch_number)
 265
 266     if branch_data is not None:
 267       logger.warn(
 268           "%s: in '%s':\n"
 269           "   branch '%s' already has name '%s',\n"
 270           "   cannot also have name '%s', ignoring the latter\n"
 271           % (warning_prefix,
 272              self.cvs_file.rcs_path, branch_number,
 273              branch_data.symbol.name, name)
 274           )
 275       return branch_data
 276
 277     symbol = self.pdc.get_symbol(name)
 278     branch_data = _BranchData(
 279         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 280         )
 281     self.branches_data[branch_number] = branch_data
 282     return branch_data
 283
 284   def _construct_distinct_name(self, name, original_name):
 285     """Construct a distinct symbol name from NAME.
 286
 287     If NAME is distinct, return it.  If it is already used in this
 288     file (as determined from its presence in self._defined_symbols),
 289     construct and return a new name that is not already used."""
 290
 291     if name not in self._defined_symbols:
 292       return name
 293     else:
 294       index = 1
 295       while True:
 296         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 297         if dup_name not in self._defined_symbols:
 298           self.collect_data.record_fatal_error(
 299               "Symbol name '%s' is already used in '%s'.\n"
 300               "The unlabeled branch '%s' must be renamed using "
 301               "--symbol-transform."
 302               % (name, self.cvs_file.rcs_path, original_name,)
 303               )
 304           return dup_name
 305
 306   def _add_unlabeled_branch(self, branch_number):
 307     original_name = "unlabeled-" + branch_number
 308     name = self.transform_symbol(original_name, branch_number)
 309     if name is None:
 310       self.collect_data.record_fatal_error(
 311           "The unlabeled branch '%s' in '%s' contains commits.\n"
 312           "It may not be ignored via a symbol transform.  (Use --exclude "
 313           "instead.)"
 314           % (original_name, self.cvs_file.rcs_path,)
 315           )
 316       # Retain the original name to allow the conversion to continue:
 317       name = original_name
 318
 319     distinct_name = self._construct_distinct_name(name, original_name)
 320     self._defined_symbols.add(distinct_name)
 321     return self._add_branch(distinct_name, branch_number)
 322
 323   def _add_tag(self, name, revision):
 324     """Record that tag NAME refers to the specified REVISION."""
 325
 326     symbol = self.pdc.get_symbol(name)
 327     tag_data = _TagData(
 328         self.collect_data.item_key_generator.gen_id(), symbol, revision
 329         )
 330     self.tags_data.setdefault(revision, []).append(tag_data)
 331     return tag_data
 332
 333   def transform_symbol(self, name, revision):
 334     """Transform a symbol according to the project's symbol transforms.
 335
 336     Transform the symbol with the original name NAME and canonicalized
 337     revision number REVISION.  Return the new symbol name or None if
 338     the symbol should be ignored entirely.
 339
 340     Log the results of the symbol transform if necessary."""
 341
 342     old_name = name
 343     # Apply any user-defined symbol transforms to the symbol name:
 344     name = self.cvs_file.project.transform_symbol(
 345         self.cvs_file, name, revision
 346         )
 347
 348     if name is None:
 349       # Ignore symbol:
 350       self.pdc.log_symbol_transform(old_name, None)
 351       logger.verbose(
 352           "   symbol '%s'=%s ignored in %s"
 353           % (old_name, revision, self.cvs_file.rcs_path,)
 354           )
 355     else:
 356       if name != old_name:
 357         self.pdc.log_symbol_transform(old_name, name)
 358         logger.verbose(
 359             "   symbol '%s'=%s transformed to '%s' in %s"
 360             % (old_name, revision, name, self.cvs_file.rcs_path,)
 361             )
 362
 363     return name
 364
 365   def define_symbol(self, name, revision):
 366     """Record a symbol definition for later processing."""
 367
 368     # Canonicalize the revision number:
 369     revision = _branch_revision_re.sub(r'\1\2', revision)
 370
 371     # Apply any user-defined symbol transforms to the symbol name:
 372     name = self.transform_symbol(name, revision)
 373
 374     if name is not None:
 375       # Verify that the revision number is valid:
 376       if _valid_revision_re.match(revision):
 377         # The revision number is valid; record it for later processing:
 378         self._symbol_defs.append( (name, revision) )
 379       else:
 380         logger.warn(
 381             'In %r:\n'
 382             '    branch %r references invalid revision %s\n'
 383             '    and will be ignored.'
 384             % (self.cvs_file.rcs_path, name, revision,)
 385             )
 386
 387   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 388     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 389
 390     Duplicate definitions of symbol names have been seen in the wild,
 391     and they can also happen when --symbol-transform is used.  If a
 392     symbol is defined to the same revision number repeatedly, then
 393     ignore all but the last definition."""
 394
 395     # Make a copy, since we have to iterate through the definitions
 396     # twice:
 397     symbol_defs = list(symbol_defs)
 398
 399     # A map { (name, revision) : [index,...] } of the indexes where
 400     # symbol definitions name=revision were found:
 401     known_definitions = {}
 402     for (i, symbol_def) in enumerate(symbol_defs):
 403       known_definitions.setdefault(symbol_def, []).append(i)
 404
 405     # A set of the indexes of entries that have to be removed from
 406     # symbol_defs:
 407     dup_indexes = set()
 408     for ((name, revision), indexes) in known_definitions.iteritems():
 409       if len(indexes) > 1:
 410         logger.verbose(
 411             "in %r:\n"
 412             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 413             % (self.cvs_file.rcs_path, name, revision,)
 414             )
 415         dup_indexes.update(indexes[:-1])
 416
 417     for (i, symbol_def) in enumerate(symbol_defs):
 418       if i not in dup_indexes:
 419         yield symbol_def
 420
 421   def _process_duplicate_defs(self, symbol_defs):
 422     """Iterate through SYMBOL_DEFS, processing duplicate names.
 423
 424     Duplicate definitions of symbol names have been seen in the wild,
 425     and they can also happen when --symbol-transform is used.  If a
 426     symbol is defined multiple times, then it is a fatal error.  This
 427     method should be called after _eliminate_trivial_duplicate_defs()."""
 428
 429     # Make a copy, since we have to access multiple times:
 430     symbol_defs = list(symbol_defs)
 431
 432     # A map {name : [index,...]} mapping the names of symbols to a
 433     # list of their definitions' indexes in symbol_defs:
 434     known_symbols = {}
 435     for (i, (name, revision)) in enumerate(symbol_defs):
 436       known_symbols.setdefault(name, []).append(i)
 437
 438     known_symbols = known_symbols.items()
 439     known_symbols.sort()
 440     dup_indexes = set()
 441     for (name, indexes) in known_symbols:
 442       if len(indexes) > 1:
 443         # This symbol was defined multiple times.
 444         self.collect_data.record_fatal_error(
 445             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 446                 name, self.cvs_file.rcs_path,
 447                 ' '.join([symbol_defs[i][1] for i in indexes]),
 448                 )
 449             )
 450         # Ignore all but the last definition for now, to allow the
 451         # conversion to proceed:
 452         dup_indexes.update(indexes[:-1])
 453
 454     for (i, symbol_def) in enumerate(symbol_defs):
 455       if i not in dup_indexes:
 456         yield symbol_def
 457
 458   def _process_symbol(self, name, revision):
 459     """Process a symbol called NAME, which is associated with REVISON.
 460
 461     REVISION is a canonical revision number with zeros removed, for
 462     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 463     transformed branch or tag name."""
 464
 465     # Add symbol to our records:
 466     if is_branch_revision_number(revision):
 467       self._add_branch(name, revision)
 468     else:
 469       self._add_tag(name, revision)
 470
 471   def process_symbols(self):
 472     """Process the symbol definitions from SELF._symbol_defs."""
 473
 474     symbol_defs = self._symbol_defs
 475     del self._symbol_defs
 476
 477     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 478     symbol_defs = self._process_duplicate_defs(symbol_defs)
 479
 480     for (name, revision) in symbol_defs:
 481       self._defined_symbols.add(name)
 482       self._process_symbol(name, revision)
 483
 484   @staticmethod
 485   def rev_to_branch_number(revision):
 486     """Return the branch_number of the branch on which REVISION lies.
 487
 488     REVISION is a branch revision number with an even number of
 489     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 490     The return value is the branch number (for example, '1.7.2').
 491     Return none iff REVISION is a trunk revision such as '1.2'."""
 492
 493     if is_trunk_revision(revision):
 494       return None
 495     return revision[:revision.rindex(".")]
 496
 497   def rev_to_branch_data(self, revision):
 498     """Return the branch_data of the branch on which REVISION lies.
 499
 500     REVISION must be a branch revision number with an even number of
 501     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 502     Raise KeyError iff REVISION is unknown."""
 503
 504     assert not is_trunk_revision(revision)
 505
 506     return self.branches_data[self.rev_to_branch_number(revision)]
 507
 508   def rev_to_lod(self, revision):
 509     """Return the line of development on which REVISION lies.
 510
 511     REVISION must be a revision number with an even number of
 512     components.  Raise KeyError iff REVISION is unknown."""
 513
 514     if is_trunk_revision(revision):
 515       return self.pdc.trunk
 516     else:
 517       return self.rev_to_branch_data(revision).symbol
 518
 519
 520 class _FileDataCollector(cvs2svn_rcsparse.Sink):
 521   """Class responsible for collecting RCS data for a particular file.
 522
 523   Any collected data that need to be remembered are stored into the
 524   referenced CollectData instance."""
 525
 526   def __init__(self, pdc, cvs_file):
 527     """Create an object that is prepared to receive data for CVS_FILE.
 528     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 529     information collected about the file."""
 530
 531     self.pdc = pdc
 532     self.cvs_file = cvs_file
 533
 534     self.collect_data = self.pdc.collect_data
 535     self.project = self.cvs_file.project
 536
 537     # A place to store information about the symbols in this file:
 538     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 539
 540     # { revision : _RevisionData instance }
 541     self._rev_data = { }
 542
 543     # Lists [ (parent, child) ] of revision number pairs indicating
 544     # that revision child depends on revision parent along the main
 545     # line of development.
 546     self._primary_dependencies = []
 547
 548     # If set, this is an RCS branch number -- rcsparse calls this the
 549     # "principal branch", but CVS and RCS refer to it as the "default
 550     # branch", so that's what we call it, even though the rcsparse API
 551     # setter method is still 'set_principal_branch'.
 552     self.default_branch = None
 553
 554     # True iff revision 1.1 of the file appears to have been imported
 555     # (as opposed to added normally).
 556     self._file_imported = False
 557
 558   def _get_rev_id(self, revision):
 559     if revision is None:
 560       return None
 561     return self._rev_data[revision].cvs_rev_id
 562
 563   def set_principal_branch(self, branch):
 564     """This is a callback method declared in Sink."""
 565
 566     if branch.find('.') == -1:
 567       # This just sets the default branch to trunk.  Normally this
 568       # shouldn't occur, but it has been seen in at least one CVS
 569       # repository.  Just ignore it.
 570       return
 571
 572     m = _branch_revision_re.match(branch)
 573     if not m:
 574       self.collect_data.record_fatal_error(
 575           'The default branch %s in file %r is not a valid branch number'
 576           % (branch, self.cvs_file.rcs_path,)
 577           )
 578       return
 579
 580     branch = m.group(1) + m.group(2)
 581     if branch.count('.') != 2:
 582       # We don't know how to deal with a non-top-level default
 583       # branch (what does CVS do?).  So if this case is detected,
 584       # punt:
 585       self.collect_data.record_fatal_error(
 586           'The default branch %s in file %r is not a top-level branch'
 587           % (branch, self.cvs_file.rcs_path,)
 588           )
 589       return
 590
 591     self.default_branch = branch
 592
  def define_tag(self, name, revision):
    """Remember the symbol name and revision, but don't process them yet.

    The definition is passed to self.sdc.define_symbol(); what was
    recorded there is processed when admin_completed() calls
    self.sdc.process_symbols().

    This is a callback method declared in Sink."""

    self.sdc.define_symbol(name, revision)
 599
  def set_expansion(self, mode):
    """Store MODE as the mode attribute of the CVSFile being parsed.

    This is a callback method declared in Sink."""

    self.cvs_file.mode = mode
 604
  def admin_completed(self):
    """Process the symbol definitions recorded by define_tag().

    This is a callback method declared in Sink."""

    # NOTE(review): presumably invoked once the admin section of the
    # file has been parsed, i.e. after all define_tag() calls --
    # confirm against the Sink interface.
    self.sdc.process_symbols()
 609
 610   def define_revision(self, revision, timestamp, author, state,
 611                       branches, next):
 612     """This is a callback method declared in Sink."""
 613
 614     for branch in branches:
 615       try:
 616         branch_data = self.sdc.rev_to_branch_data(branch)
 617       except KeyError:
 618         # Normally we learn about the branches from the branch names
 619         # and numbers parsed from the symbolic name header.  But this
 620         # must have been an unlabeled branch that slipped through the
 621         # net.  Generate a name for it and create a _BranchData record
 622         # for it now.
 623         branch_data = self.sdc._add_unlabeled_branch(
 624             self.sdc.rev_to_branch_number(branch))
 625
 626       assert branch_data.child is None
 627       branch_data.child = branch
 628
 629     if revision in self._rev_data:
 630       # This revision has already been seen.
 631       logger.error('File %r contains duplicate definitions of revision %s.'
 632                   % (self.cvs_file.rcs_path, revision,))
 633       raise RuntimeError()
 634
 635     # Record basic information about the revision:
 636     rev_data = _RevisionData(
 637         self.collect_data.item_key_generator.gen_id(),
 638         revision, int(timestamp), author, state)
 639     self._rev_data[revision] = rev_data
 640
 641     # When on trunk, the RCS 'next' revision number points to what
 642     # humans might consider to be the 'previous' revision number.  For
 643     # example, 1.3's RCS 'next' is 1.2.
 644     #
 645     # However, on a branch, the RCS 'next' revision number really does
 646     # point to what humans would consider to be the 'next' revision
 647     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 648     #
 649     # In other words, in RCS, 'next' always means "where to find the next
 650     # deltatext that you need this revision to retrieve.
 651     #
 652     # That said, we don't *want* RCS's behavior here, so we determine
 653     # whether we're on trunk or a branch and set the dependencies
 654     # accordingly.
 655     if next:
 656       if is_trunk_revision(revision):
 657         self._primary_dependencies.append( (next, revision,) )
 658       else:
 659         self._primary_dependencies.append( (revision, next,) )
 660
 661   def tree_completed(self):
 662     """The revision tree has been parsed.
 663
 664     Analyze it for consistency and connect some loose ends.
 665
 666     This is a callback method declared in Sink."""
 667
 668     self._resolve_primary_dependencies()
 669     self._resolve_branch_dependencies()
 670     self._sort_branches()
 671     self._resolve_tag_dependencies()
 672
 673     # Compute the preliminary CVSFileItems for this file:
 674     cvs_items = []
 675     cvs_items.extend(self._get_cvs_revisions())
 676     cvs_items.extend(self._get_cvs_branches())
 677     cvs_items.extend(self._get_cvs_tags())
 678     self._cvs_file_items = CVSFileItems(
 679         self.cvs_file, self.pdc.trunk, cvs_items
 680         )
 681
 682     self._cvs_file_items.check_link_consistency()
 683
 684   def _resolve_primary_dependencies(self):
 685     """Resolve the dependencies listed in self._primary_dependencies."""
 686
 687     for (parent, child,) in self._primary_dependencies:
 688       parent_data = self._rev_data[parent]
 689       assert parent_data.child is None
 690       parent_data.child = child
 691
 692       child_data = self._rev_data[child]
 693       assert child_data.parent is None
 694       child_data.parent = parent
 695
 696   def _resolve_branch_dependencies(self):
 697     """Resolve dependencies involving branches."""
 698
 699     for branch_data in self.sdc.branches_data.values():
 700       # The branch_data's parent has the branch as a child regardless
 701       # of whether the branch had any subsequent commits:
 702       try:
 703         parent_data = self._rev_data[branch_data.parent]
 704       except KeyError:
 705         logger.warn(
 706             'In %r:\n'
 707             '    branch %r references non-existing revision %s\n'
 708             '    and will be ignored.'
 709             % (self.cvs_file.rcs_path, branch_data.symbol.name,
 710                branch_data.parent,))
 711         del self.sdc.branches_data[branch_data.branch_number]
 712       else:
 713         parent_data.branches_data.append(branch_data)
 714
 715         # If the branch has a child (i.e., something was committed on
 716         # the branch), then we store a reference to the branch_data
 717         # there, define the child's parent to be the branch's parent,
 718         # and list the child in the branch parent's branches_revs_data:
 719         if branch_data.child is not None:
 720           child_data = self._rev_data[branch_data.child]
 721           assert child_data.parent_branch_data is None
 722           child_data.parent_branch_data = branch_data
 723           assert child_data.parent is None
 724           child_data.parent = branch_data.parent
 725           parent_data.branches_revs_data.append(branch_data.child)
 726
 727   def _sort_branches(self):
 728     """Sort the branches sprouting from each revision in creation order.
 729
 730     Creation order is taken to be the reverse of the order that they
 731     are listed in the symbols part of the RCS file.  (If a branch is
 732     created then deleted, a later branch can be assigned the recycled
 733     branch number; therefore branch numbers are not an indication of
 734     creation order.)"""
 735
 736     for rev_data in self._rev_data.values():
 737       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 738
 739   def _resolve_tag_dependencies(self):
 740     """Resolve dependencies involving tags."""
 741
 742     for (rev, tag_data_list) in self.sdc.tags_data.items():
 743       try:
 744         parent_data = self._rev_data[rev]
 745       except KeyError:
 746         logger.warn(
 747             'In %r:\n'
 748             '    the following tag(s) reference non-existing revision %s\n'
 749             '    and will be ignored:\n'
 750             '    %s' % (
 751                 self.cvs_file.rcs_path, rev,
 752                 ', '.join([repr(tag_data.symbol.name)
 753                            for tag_data in tag_data_list]),))
 754         del self.sdc.tags_data[rev]
 755       else:
 756         for tag_data in tag_data_list:
 757           assert tag_data.rev == rev
 758           # The tag_data's rev has the tag as a child:
 759           parent_data.tags_data.append(tag_data)
 760
 761   def _get_cvs_branches(self):
 762     """Generate the CVSBranches present in this file."""
 763
 764     for branch_data in self.sdc.branches_data.values():
 765       yield CVSBranch(
 766           branch_data.id, self.cvs_file, branch_data.symbol,
 767           branch_data.branch_number,
 768           self.sdc.rev_to_lod(branch_data.parent),
 769           self._get_rev_id(branch_data.parent),
 770           self._get_rev_id(branch_data.child),
 771           None,
 772           )
 773
 774   def _get_cvs_tags(self):
 775     """Generate the CVSTags present in this file."""
 776
 777     for tags_data in self.sdc.tags_data.values():
 778       for tag_data in tags_data:
 779         yield CVSTag(
 780             tag_data.id, self.cvs_file, tag_data.symbol,
 781             self.sdc.rev_to_lod(tag_data.rev),
 782             self._get_rev_id(tag_data.rev),
 783             None,
 784             )
 785
 786   def set_description(self, description):
 787     """This is a callback method declared in Sink."""
 788
 789     self.cvs_file.description = description
 790     self.cvs_file.determine_file_properties(Ctx().file_property_setters)
 791
 792   def set_revision_info(self, revision, log, text):
 793     """This is a callback method declared in Sink."""
 794
 795     rev_data = self._rev_data[revision]
 796     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 797
 798     if cvs_rev.metadata_id is not None:
 799       # Users have reported problems with repositories in which the
 800       # deltatext block for revision 1.1 appears twice.  It is not
 801       # known whether this results from a CVS/RCS bug, or from botched
 802       # hand-editing of the repository.  In any case, empirically, cvs
 803       # and rcs both use the first version when checking out data, so
 804       # that's what we will do.  (For the record: "cvs log" fails on
 805       # such a file; "rlog" prints the log message from the first
 806       # block and ignores the second one.)
 807       logger.warn(
 808           "%s: in '%s':\n"
 809           "   Deltatext block for revision %s appeared twice;\n"
 810           "   ignoring the second occurrence.\n"
 811           % (warning_prefix, self.cvs_file.rcs_path, revision,)
 812           )
 813       return
 814
 815     if is_trunk_revision(revision):
 816       branch_name = None
 817     else:
 818       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 819
 820     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 821         self.project, branch_name, rev_data.author, log
 822         )
 823     cvs_rev.deltatext_exists = bool(text)
 824
 825     # If this is revision 1.1, determine whether the file appears to
 826     # have been created via 'cvs add' instead of 'cvs import'.  The
 827     # test is that the log message CVS uses for 1.1 in imports is
 828     # "Initial revision\n" with no period.  (This fact helps determine
 829     # whether this file might have had a default branch in the past.)
 830     if revision == '1.1':
 831       self._file_imported = (log == 'Initial revision\n')
 832
 833   def parse_completed(self):
 834     """Finish the processing of this file.
 835
 836     This is a callback method declared in Sink."""
 837
 838     # Make sure that there was an info section for each revision:
 839     for cvs_item in self._cvs_file_items.values():
 840       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 841         self.collect_data.record_fatal_error(
 842             '%r has no deltatext section for revision %s'
 843             % (self.cvs_file.rcs_path, cvs_item.rev,)
 844             )
 845
 846   def _determine_operation(self, rev_data):
 847     prev_rev_data = self._rev_data.get(rev_data.parent)
 848     return cvs_revision_type_map[(
 849         rev_data.state != 'dead',
 850         prev_rev_data is not None and prev_rev_data.state != 'dead',
 851         )]
 852
 853   def _get_cvs_revisions(self):
 854     """Generate the CVSRevisions present in this file."""
 855
 856     for rev_data in self._rev_data.itervalues():
 857       yield self._get_cvs_revision(rev_data)
 858
 859   def _get_cvs_revision(self, rev_data):
 860     """Create and return a CVSRevision for REV_DATA."""
 861
 862     branch_ids = [
 863         branch_data.id
 864         for branch_data in rev_data.branches_data
 865         ]
 866
 867     branch_commit_ids = [
 868         self._get_rev_id(rev)
 869         for rev in rev_data.branches_revs_data
 870         ]
 871
 872     tag_ids = [
 873         tag_data.id
 874         for tag_data in rev_data.tags_data
 875         ]
 876
 877     revision_type = self._determine_operation(rev_data)
 878
 879     return revision_type(
 880         self._get_rev_id(rev_data.rev), self.cvs_file,
 881         rev_data.timestamp, None,
 882         self._get_rev_id(rev_data.parent),
 883         self._get_rev_id(rev_data.child),
 884         rev_data.rev,
 885         True,
 886         self.sdc.rev_to_lod(rev_data.rev),
 887         rev_data.get_first_on_branch_id(),
 888         False, None, None,
 889         tag_ids, branch_ids, branch_commit_ids,
 890         rev_data.revision_reader_token
 891         )
 892
 893   def get_cvs_file_items(self):
 894     """Finish up and return a CVSFileItems instance for this file.
 895
 896     This method must only be called once."""
 897
 898     self._process_ntdbrs()
 899
 900     # Break a circular reference loop, allowing the memory for self
 901     # and sdc to be freed.
 902     del self.sdc
 903
 904     return self._cvs_file_items
 905
 906   def _process_ntdbrs(self):
 907     """Fix up any non-trunk default branch revisions (if present).
 908
 909     If a non-trunk default branch is determined to have existed, yield
 910     the _RevisionData.ids for all revisions that were once non-trunk
 911     default revisions, in dependency order.
 912
 913     There are two cases to handle:
 914
 915     One case is simple.  The RCS file lists a default branch
 916     explicitly in its header, such as '1.1.1'.  In this case, we know
 917     that every revision on the vendor branch is to be treated as head
 918     of trunk at that point in time.
 919
 920     But there's also a degenerate case.  The RCS file does not
 921     currently have a default branch, yet we can deduce that for some
 922     period in the past it probably *did* have one.  For example, the
 923     file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
 924     dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
 925     after 1.2.  In this case, we should record 1.1.1.96 as the last
 926     vendor revision to have been the head of the default branch.
 927
 928     If any non-trunk default branch revisions are found:
 929
 930     - Set their ntdbr members to True.
 931
 932     - Connect the last one with revision 1.2.
 933
 934     - Remove revision 1.1 if it is not needed.
 935
 936     """
 937
 938     try:
 939       if self.default_branch:
 940         try:
 941           vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
 942         except KeyError:
 943           logger.warn(
 944               '%s: In %s:\n'
 945               '    vendor branch %r is not present in file and will be ignored.'
 946               % (warning_prefix, self.cvs_file.rcs_path, self.default_branch,)
 947               )
 948           self.default_branch = None
 949           return
 950         vendor_lod_items = self._cvs_file_items.get_lod_items(
 951             self._cvs_file_items[vendor_cvs_branch_id]
 952             )
 953         if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
 954           return
 955       elif self._file_imported:
 956         vendor_branch_data = self.sdc.branches_data.get('1.1.1')
 957         if vendor_branch_data is None:
 958           return
 959         else:
 960           vendor_lod_items = self._cvs_file_items.get_lod_items(
 961               self._cvs_file_items[vendor_branch_data.id]
 962               )
 963           if not self._cvs_file_items.process_historical_ntdb(
 964                 vendor_lod_items
 965                 ):
 966             return
 967       else:
 968         return
 969     except VendorBranchError, e:
 970       self.collect_data.record_fatal_error(str(e))
 971       return
 972
 973     if self._file_imported:
 974       self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
 975
 976     self._cvs_file_items.check_link_consistency()
 977
 978
 979 class _ProjectDataCollector:
 980   def __init__(self, collect_data, project):
 981     self.collect_data = collect_data
 982     self.project = project
 983     self.num_files = 0
 984
 985     # The Trunk LineOfDevelopment object for this project:
 986     self.trunk = Trunk(
 987         self.collect_data.symbol_key_generator.gen_id(), self.project
 988         )
 989     self.project.trunk_id = self.trunk.id
 990
 991     # This causes a record for self.trunk to spring into existence:
 992     self.collect_data.register_trunk(self.trunk)
 993
 994     # A map { name -> Symbol } for all known symbols in this project.
 995     # The symbols listed here are undifferentiated into Branches and
 996     # Tags because the same name might appear as a branch in one file
 997     # and a tag in another.
 998     self.symbols = {}
 999
1000     # A map { (old_name, new_name) : count } indicating how many files
1001     # were affected by each each symbol name transformation:
1002     self.symbol_transform_counts = {}
1003
1004   def get_symbol(self, name):
1005     """Return the Symbol object for the symbol named NAME in this project.
1006
1007     If such a symbol does not yet exist, allocate a new symbol_id,
1008     create a Symbol instance, store it in self.symbols, and return it."""
1009
1010     symbol = self.symbols.get(name)
1011     if symbol is None:
1012       symbol = Symbol(
1013           self.collect_data.symbol_key_generator.gen_id(),
1014           self.project, name)
1015       self.symbols[name] = symbol
1016     return symbol
1017
1018   def log_symbol_transform(self, old_name, new_name):
1019     """Record that OLD_NAME was transformed to NEW_NAME in one file.
1020
1021     This information is used to generated a statistical summary of
1022     symbol transforms."""
1023
1024     try:
1025       self.symbol_transform_counts[old_name, new_name] += 1
1026     except KeyError:
1027       self.symbol_transform_counts[old_name, new_name] = 1
1028
1029   def summarize_symbol_transforms(self):
1030     if self.symbol_transform_counts and logger.is_on(logger.NORMAL):
1031       logger.normal('Summary of symbol transforms:')
1032       transforms = self.symbol_transform_counts.items()
1033       transforms.sort()
1034       for ((old_name, new_name), count) in transforms:
1035         if new_name is None:
1036           logger.normal('    "%s" ignored in %d files' % (old_name, count,))
1037         else:
1038           logger.normal(
1039               '    "%s" transformed to "%s" in %d files'
1040               % (old_name, new_name, count,)
1041               )
1042
1043   def process_file(self, cvs_file):
1044     logger.normal(cvs_file.rcs_path)
1045     fdc = _FileDataCollector(self, cvs_file)
1046     try:
1047       cvs2svn_rcsparse.parse(open(cvs_file.rcs_path, 'rb'), fdc)
1048     except (cvs2svn_rcsparse.common.RCSParseError, RuntimeError):
1049       self.collect_data.record_fatal_error(
1050           "%r is not a valid ,v file" % (cvs_file.rcs_path,)
1051           )
1052       # Abort the processing of this file, but let the pass continue
1053       # with other files:
1054       return
1055     except ValueError, e:
1056       self.collect_data.record_fatal_error(
1057           "%r is not a valid ,v file (%s)" % (cvs_file.rcs_path, str(e),)
1058           )
1059       # Abort the processing of this file, but let the pass continue
1060       # with other files:
1061       return
1062     except:
1063       logger.warn("Exception occurred while parsing %s" % cvs_file.rcs_path)
1064       raise
1065     else:
1066       self.num_files += 1
1067
1068     return fdc.get_cvs_file_items()
1069
1070
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, stats_keeper):
    """Open the temporary stores and create the key generators.

    STATS_KEEPER is the object to which every recorded CVSFile and
    CVSItem is reported (see add_cvs_file_items())."""

    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # Descriptions of the fatal errors recorded so far (see
    # record_fatal_error()):
    self.fatal_errors = []

    # The total number of files parsed successfully in all projects:
    self.num_files = 0
    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    logger.error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # This causes a record to spring into existence:
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile.

    CVS_FILE_ITEMS may be None, which is what
    _ProjectDataCollector.process_file() returns for a file that could
    not be parsed.  In that case nothing is recorded for the file (the
    fatal error has already been recorded by process_file())."""

    if cvs_file_items is None:
      # Without this check, the first method call below would raise
      # AttributeError and abort the whole pass, instead of letting it
      # continue with the other files.
      return

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def process_project(self, project, cvs_paths):
    """Process the CVSPaths in CVS_PATHS, which belong to PROJECT.

    CVSDirectory instances are recorded directly; every other path is
    parsed as an RCS file and its items are processed and recorded.
    A file that cannot be parsed yields a fatal error but does not
    stop the processing of the remaining paths.  If CVS_PATHS contains
    no files at all, a fatal error is recorded."""

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in cvs_paths:
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
      else:
        # process_file() returns None if the file could not be
        # parsed; _process_cvs_file_items() skips such files.
        cvs_file_items = pdc.process_file(cvs_path)
        self._process_cvs_file_items(cvs_file_items)
        found_rcs_file = True

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    logger.verbose('Processed', self.num_files, 'files')

  def _register_empty_subdirectories(self):
    """Fill in the CVSDirectory.empty_subdirectory_ids members.

    A directory counts as empty if no CVSFile in _cvs_path_db has it
    as its parent directory or as a more distant ancestor.  The id of
    each such directory is appended to the empty_subdirectory_ids list
    of its parent directory (if it has one)."""

    # Start with all directories as candidates...
    directories = set(
        path
        for path in Ctx()._cvs_path_db.itervalues()
        if isinstance(path, CVSDirectory)
        )
    # ...and strike out every directory that is an ancestor of a file.
    for path in Ctx()._cvs_path_db.itervalues():
      if isinstance(path, CVSFile):
        directory = path.parent_directory
        # Once a directory is found to have been struck out already,
        # the walk up the tree stops.
        while directory is not None and directory in directories:
          directories.remove(directory)
          directory = directory.parent_directory
    for directory in directories:
      if directory.parent_directory is not None:
        directory.parent_directory.empty_subdirectory_ids.append(directory.id)

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._register_empty_subdirectories()
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
1215
1216