cvs2svn_lib/collect_data.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Data collection classes.
  18
  19 This module contains the code used to collect data from the CVS
  20 repository.  It parses *,v files, recording all useful information
  21 except for the actual file contents.
  22
  23 As a *,v file is parsed, the information pertaining to the file is
  24 accumulated in memory, mostly in _RevisionData, _BranchData, and
  25 _TagData objects.  When parsing is complete, a final pass is made over
  26 the data to create some final dependency links, collect statistics,
  27 etc., then the _*Data objects are converted into CVSItem objects
  28 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
  29 dumped into databases.
  30
  31 During the data collection, persistent unique ids are allocated to
  32 many types of objects: CVSFile, Symbol, and CVSItems.  CVSItems are a
  33 special case.  CVSItem ids are unique across all CVSItem types, and
  34 the ids are carried over from the corresponding data collection
  35 objects:
  36
  37     _RevisionData -> CVSRevision
  38
  39     _BranchData -> CVSBranch
  40
  41     _TagData -> CVSTag
  42
  43 In a later pass it is possible to convert tags <-> branches.  But even
  44 if this occurs, the new branch or tag uses the same id as the old tag
  45 or branch.
  46
  47 """
  48
  49
  50 import re
  51
  52 from cvs2svn_lib import config
  53 from cvs2svn_lib.common import DB_OPEN_NEW
  54 from cvs2svn_lib.common import warning_prefix
  55 from cvs2svn_lib.common import error_prefix
  56 from cvs2svn_lib.common import is_trunk_revision
  57 from cvs2svn_lib.common import is_branch_revision_number
  58 from cvs2svn_lib.log import logger
  59 from cvs2svn_lib.context import Ctx
  60 from cvs2svn_lib.artifact_manager import artifact_manager
  61 from cvs2svn_lib.cvs_path import CVSFile
  62 from cvs2svn_lib.cvs_path import CVSDirectory
  63 from cvs2svn_lib.symbol import Symbol
  64 from cvs2svn_lib.symbol import Trunk
  65 from cvs2svn_lib.cvs_item import CVSRevision
  66 from cvs2svn_lib.cvs_item import CVSBranch
  67 from cvs2svn_lib.cvs_item import CVSTag
  68 from cvs2svn_lib.cvs_item import cvs_revision_type_map
  69 from cvs2svn_lib.cvs_file_items import VendorBranchError
  70 from cvs2svn_lib.cvs_file_items import CVSFileItems
  71 from cvs2svn_lib.key_generator import KeyGenerator
  72 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
  73 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
  74 from cvs2svn_lib.metadata_database import MetadataDatabase
  75 from cvs2svn_lib.metadata_database import MetadataLogger
  76
  77 from cvs2svn_lib.rcsparser import Sink
  78 from cvs2svn_lib.rcsparser import parse
  79 from cvs2svn_lib.rcsparser import RCSParseError
  80
  81
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  It accepts two or
# more dot-separated digit groups and nothing else.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching branch numbers written either in the
# CVS form (with an extra 0 component, e.g. '1.7.0.2') or in the RCS
# form (e.g. '1.7.2').  Group 1 captures the leading pairs of digit
# groups including their trailing dot, and group 2 captures the last
# digit group; concatenating the two groups therefore yields the
# branch number without the extra 0.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
  98
  99
 100 def is_same_line_of_development(rev1, rev2):
 101   """Return True if rev1 and rev2 are on the same line of
 102   development (i.e., both on trunk, or both on the same branch);
 103   return False otherwise.  Either rev1 or rev2 can be None, in
 104   which case automatically return False."""
 105
 106   if rev1 is None or rev2 is None:
 107     return False
 108   if is_trunk_revision(rev1) and is_trunk_revision(rev2):
 109     # Trunk revisions have to be handled specially because the main
 110     # trunk version number can be changed; e.g., from 1 to 2.
 111     return True
 112   if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
 113     return True
 114   return False
 115
 116
 117 class _RevisionData:
 118   """We track the state of each revision so that in set_revision_info,
 119   we can determine if our op is an add/change/delete.  We can do this
 120   because in set_revision_info, we'll have all of the _RevisionData
 121   for a file at our fingertips, and we need to examine the state of
 122   our prev_rev to determine if we're an add or a change.  Without the
 123   state of the prev_rev, we are unable to distinguish between an add
 124   and a change."""
 125
 126   def __init__(self, cvs_rev_id, rev, timestamp, author, state):
 127     # The id of this revision:
 128     self.cvs_rev_id = cvs_rev_id
 129     self.rev = rev
 130     self.timestamp = timestamp
 131     self.author = author
 132     self.state = state
 133
 134     # If this is the first revision on a branch, then this is the
 135     # branch_data of that branch; otherwise it is None.
 136     self.parent_branch_data = None
 137
 138     # The revision number of the parent of this revision along the
 139     # same line of development, if any.  For the first revision R on a
 140     # branch, we consider the revision from which R sprouted to be the
 141     # 'parent'.  If this is the root revision in the file's revision
 142     # tree, then this field is None.
 143     #
 144     # Note that this revision can't be determined arithmetically (due
 145     # to cvsadmin -o), which is why this field is necessary.
 146     self.parent = None
 147
 148     # The revision number of the primary child of this revision (the
 149     # child along the same line of development), if any; otherwise,
 150     # None.
 151     self.child = None
 152
 153     # The _BranchData instances of branches that sprout from this
 154     # revision, sorted in ascending order by branch number.  It would
 155     # be inconvenient to initialize it here because we would have to
 156     # scan through all branches known by the _SymbolDataCollector to
 157     # find the ones having us as the parent.  Instead, this
 158     # information is filled in by
 159     # _FileDataCollector._resolve_dependencies() and sorted by
 160     # _FileDataCollector._sort_branches().
 161     self.branches_data = []
 162
 163     # The revision numbers of the first commits on any branches on
 164     # which commits occurred.  This dependency is kept explicitly
 165     # because otherwise a revision-only topological sort would miss
 166     # the dependency that exists via branches_data.
 167     self.branches_revs_data = []
 168
 169     # The _TagData instances of tags that are connected to this
 170     # revision.
 171     self.tags_data = []
 172
 173     # A token that may be set by a RevisionCollector, then used by
 174     # RevisionReader to obtain the text again.
 175     self.revision_reader_token = None
 176
 177   def get_first_on_branch_id(self):
 178     return self.parent_branch_data and self.parent_branch_data.id
 179
 180
 181 class _SymbolData:
 182   """Collection area for information about a symbol in a single CVSFile.
 183
 184   SYMBOL is an instance of Symbol, undifferentiated as a Branch or a
 185   Tag regardless of whether self is a _BranchData or a _TagData."""
 186
 187   def __init__(self, id, symbol):
 188     """Initialize an object for SYMBOL."""
 189
 190     # The unique id that will be used for this particular symbol in
 191     # this particular file.  This same id will be used for the CVSItem
 192     # that is derived from this instance.
 193     self.id = id
 194
 195     # An instance of Symbol.
 196     self.symbol = symbol
 197
 198
 199 class _BranchData(_SymbolData):
 200   """Collection area for information about a Branch in a single CVSFile."""
 201
 202   def __init__(self, id, symbol, branch_number):
 203     _SymbolData.__init__(self, id, symbol)
 204
 205     # The branch number (e.g., '1.5.2') of this branch.
 206     self.branch_number = branch_number
 207
 208     # The revision number of the revision from which this branch
 209     # sprouts (e.g., '1.5').
 210     self.parent = self.branch_number[:self.branch_number.rindex(".")]
 211
 212     # The revision number of the first commit on this branch, if any
 213     # (e.g., '1.5.2.1'); otherwise, None.
 214     self.child = None
 215
 216
 217 class _TagData(_SymbolData):
 218   """Collection area for information about a Tag in a single CVSFile."""
 219
 220   def __init__(self, id, symbol, rev):
 221     _SymbolData.__init__(self, id, symbol)
 222
 223     # The revision number being tagged (e.g., '1.5.2.3').
 224     self.rev = rev
 225
 226
 227 class _SymbolDataCollector(object):
 228   """Collect information about symbols in a single CVSFile."""
 229
 230   def __init__(self, fdc, cvs_file):
 231     self.fdc = fdc
 232     self.cvs_file = cvs_file
 233
 234     self.pdc = self.fdc.pdc
 235     self.collect_data = self.fdc.collect_data
 236
 237     # A list [(name, revision), ...] of symbols defined in the header
 238     # of the file.  The name has already been transformed using the
 239     # symbol transform rules.  If the symbol transform rules indicate
 240     # that the symbol should be ignored, then it is never added to
 241     # this list.  This list is processed then deleted in
 242     # process_symbols().
 243     self._symbol_defs = []
 244
 245     # A set containing the transformed names of symbols in this file
 246     # (used to detect duplicates during processing of unlabeled
 247     # branches):
 248     self._defined_symbols = set()
 249
 250     # Map { branch_number : _BranchData }, where branch_number has an
 251     # odd number of digits.
 252     self.branches_data = { }
 253
 254     # Map { revision : [ tag_data ] }, where revision has an even
 255     # number of digits, and the value is a list of _TagData objects
 256     # for tags that apply to that revision.
 257     self.tags_data = { }
 258
 259   def _add_branch(self, name, branch_number):
 260     """Record that BRANCH_NUMBER is the branch number for branch NAME,
 261     and derive and record the revision from which NAME sprouts.
 262     BRANCH_NUMBER is an RCS branch number with an odd number of
 263     components, for example '1.7.2' (never '1.7.0.2').  Return the
 264     _BranchData instance (which is usually newly-created)."""
 265
 266     branch_data = self.branches_data.get(branch_number)
 267
 268     if branch_data is not None:
 269       logger.warn(
 270           "%s: in '%s':\n"
 271           "   branch '%s' already has name '%s',\n"
 272           "   cannot also have name '%s', ignoring the latter\n"
 273           % (warning_prefix,
 274              self.cvs_file.rcs_path, branch_number,
 275              branch_data.symbol.name, name)
 276           )
 277       return branch_data
 278
 279     symbol = self.pdc.get_symbol(name)
 280     branch_data = _BranchData(
 281         self.collect_data.item_key_generator.gen_id(), symbol, branch_number
 282         )
 283     self.branches_data[branch_number] = branch_data
 284     return branch_data
 285
 286   def _construct_distinct_name(self, name, original_name):
 287     """Construct a distinct symbol name from NAME.
 288
 289     If NAME is distinct, return it.  If it is already used in this
 290     file (as determined from its presence in self._defined_symbols),
 291     construct and return a new name that is not already used."""
 292
 293     if name not in self._defined_symbols:
 294       return name
 295     else:
 296       index = 1
 297       while True:
 298         dup_name = '%s-DUPLICATE-%d' % (name, index,)
 299         if dup_name not in self._defined_symbols:
 300           self.collect_data.record_fatal_error(
 301               "Symbol name '%s' is already used in '%s'.\n"
 302               "The unlabeled branch '%s' must be renamed using "
 303               "--symbol-transform."
 304               % (name, self.cvs_file.rcs_path, original_name,)
 305               )
 306           return dup_name
 307
 308   def _add_unlabeled_branch(self, branch_number):
 309     original_name = "unlabeled-" + branch_number
 310     name = self.transform_symbol(original_name, branch_number)
 311     if name is None:
 312       self.collect_data.record_fatal_error(
 313           "The unlabeled branch '%s' in '%s' contains commits.\n"
 314           "It may not be ignored via a symbol transform.  (Use --exclude "
 315           "instead.)"
 316           % (original_name, self.cvs_file.rcs_path,)
 317           )
 318       # Retain the original name to allow the conversion to continue:
 319       name = original_name
 320
 321     distinct_name = self._construct_distinct_name(name, original_name)
 322     self._defined_symbols.add(distinct_name)
 323     return self._add_branch(distinct_name, branch_number)
 324
 325   def _add_tag(self, name, revision):
 326     """Record that tag NAME refers to the specified REVISION."""
 327
 328     symbol = self.pdc.get_symbol(name)
 329     tag_data = _TagData(
 330         self.collect_data.item_key_generator.gen_id(), symbol, revision
 331         )
 332     self.tags_data.setdefault(revision, []).append(tag_data)
 333     return tag_data
 334
 335   def transform_symbol(self, name, revision):
 336     """Transform a symbol according to the project's symbol transforms.
 337
 338     Transform the symbol with the original name NAME and canonicalized
 339     revision number REVISION.  Return the new symbol name or None if
 340     the symbol should be ignored entirely.
 341
 342     Log the results of the symbol transform if necessary."""
 343
 344     old_name = name
 345     # Apply any user-defined symbol transforms to the symbol name:
 346     name = self.cvs_file.project.transform_symbol(
 347         self.cvs_file, name, revision
 348         )
 349
 350     if name is None:
 351       # Ignore symbol:
 352       self.pdc.log_symbol_transform(old_name, None)
 353       logger.verbose(
 354           "   symbol '%s'=%s ignored in %s"
 355           % (old_name, revision, self.cvs_file.rcs_path,)
 356           )
 357     else:
 358       if name != old_name:
 359         self.pdc.log_symbol_transform(old_name, name)
 360         logger.verbose(
 361             "   symbol '%s'=%s transformed to '%s' in %s"
 362             % (old_name, revision, name, self.cvs_file.rcs_path,)
 363             )
 364
 365     return name
 366
 367   def define_symbol(self, name, revision):
 368     """Record a symbol definition for later processing."""
 369
 370     # Canonicalize the revision number:
 371     revision = _branch_revision_re.sub(r'\1\2', revision)
 372
 373     # Apply any user-defined symbol transforms to the symbol name:
 374     name = self.transform_symbol(name, revision)
 375
 376     if name is not None:
 377       # Verify that the revision number is valid:
 378       if _valid_revision_re.match(revision):
 379         # The revision number is valid; record it for later processing:
 380         self._symbol_defs.append( (name, revision) )
 381       else:
 382         logger.warn(
 383             'In %r:\n'
 384             '    branch %r references invalid revision %s\n'
 385             '    and will be ignored.'
 386             % (self.cvs_file.rcs_path, name, revision,)
 387             )
 388
 389   def _eliminate_trivial_duplicate_defs(self, symbol_defs):
 390     """Iterate through SYMBOL_DEFS, Removing identical duplicate definitions.
 391
 392     Duplicate definitions of symbol names have been seen in the wild,
 393     and they can also happen when --symbol-transform is used.  If a
 394     symbol is defined to the same revision number repeatedly, then
 395     ignore all but the last definition."""
 396
 397     # Make a copy, since we have to iterate through the definitions
 398     # twice:
 399     symbol_defs = list(symbol_defs)
 400
 401     # A map { (name, revision) : [index,...] } of the indexes where
 402     # symbol definitions name=revision were found:
 403     known_definitions = {}
 404     for (i, symbol_def) in enumerate(symbol_defs):
 405       known_definitions.setdefault(symbol_def, []).append(i)
 406
 407     # A set of the indexes of entries that have to be removed from
 408     # symbol_defs:
 409     dup_indexes = set()
 410     for ((name, revision), indexes) in known_definitions.iteritems():
 411       if len(indexes) > 1:
 412         logger.verbose(
 413             "in %r:\n"
 414             "   symbol %s:%s defined multiple times; ignoring duplicates\n"
 415             % (self.cvs_file.rcs_path, name, revision,)
 416             )
 417         dup_indexes.update(indexes[:-1])
 418
 419     for (i, symbol_def) in enumerate(symbol_defs):
 420       if i not in dup_indexes:
 421         yield symbol_def
 422
 423   def _process_duplicate_defs(self, symbol_defs):
 424     """Iterate through SYMBOL_DEFS, processing duplicate names.
 425
 426     Duplicate definitions of symbol names have been seen in the wild,
 427     and they can also happen when --symbol-transform is used.  If a
 428     symbol is defined multiple times, then it is a fatal error.  This
 429     method should be called after _eliminate_trivial_duplicate_defs()."""
 430
 431     # Make a copy, since we have to access multiple times:
 432     symbol_defs = list(symbol_defs)
 433
 434     # A map {name : [index,...]} mapping the names of symbols to a
 435     # list of their definitions' indexes in symbol_defs:
 436     known_symbols = {}
 437     for (i, (name, revision)) in enumerate(symbol_defs):
 438       known_symbols.setdefault(name, []).append(i)
 439
 440     known_symbols = known_symbols.items()
 441     known_symbols.sort()
 442     dup_indexes = set()
 443     for (name, indexes) in known_symbols:
 444       if len(indexes) > 1:
 445         # This symbol was defined multiple times.
 446         self.collect_data.record_fatal_error(
 447             "Multiple definitions of the symbol '%s' in '%s': %s" % (
 448                 name, self.cvs_file.rcs_path,
 449                 ' '.join([symbol_defs[i][1] for i in indexes]),
 450                 )
 451             )
 452         # Ignore all but the last definition for now, to allow the
 453         # conversion to proceed:
 454         dup_indexes.update(indexes[:-1])
 455
 456     for (i, symbol_def) in enumerate(symbol_defs):
 457       if i not in dup_indexes:
 458         yield symbol_def
 459
 460   def _process_symbol(self, name, revision):
 461     """Process a symbol called NAME, which is associated with REVISON.
 462
 463     REVISION is a canonical revision number with zeros removed, for
 464     example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is a
 465     transformed branch or tag name."""
 466
 467     # Add symbol to our records:
 468     if is_branch_revision_number(revision):
 469       self._add_branch(name, revision)
 470     else:
 471       self._add_tag(name, revision)
 472
 473   def process_symbols(self):
 474     """Process the symbol definitions from SELF._symbol_defs."""
 475
 476     symbol_defs = self._symbol_defs
 477     del self._symbol_defs
 478
 479     symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
 480     symbol_defs = self._process_duplicate_defs(symbol_defs)
 481
 482     for (name, revision) in symbol_defs:
 483       self._defined_symbols.add(name)
 484       self._process_symbol(name, revision)
 485
 486   @staticmethod
 487   def rev_to_branch_number(revision):
 488     """Return the branch_number of the branch on which REVISION lies.
 489
 490     REVISION is a branch revision number with an even number of
 491     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 492     The return value is the branch number (for example, '1.7.2').
 493     Return none iff REVISION is a trunk revision such as '1.2'."""
 494
 495     if is_trunk_revision(revision):
 496       return None
 497     return revision[:revision.rindex(".")]
 498
 499   def rev_to_branch_data(self, revision):
 500     """Return the branch_data of the branch on which REVISION lies.
 501
 502     REVISION must be a branch revision number with an even number of
 503     components; for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
 504     Raise KeyError iff REVISION is unknown."""
 505
 506     assert not is_trunk_revision(revision)
 507
 508     return self.branches_data[self.rev_to_branch_number(revision)]
 509
 510   def rev_to_lod(self, revision):
 511     """Return the line of development on which REVISION lies.
 512
 513     REVISION must be a revision number with an even number of
 514     components.  Raise KeyError iff REVISION is unknown."""
 515
 516     if is_trunk_revision(revision):
 517       return self.pdc.trunk
 518     else:
 519       return self.rev_to_branch_data(revision).symbol
 520
 521
 522 class _FileDataCollector(Sink):
 523   """Class responsible for collecting RCS data for a particular file.
 524
 525   Any collected data that need to be remembered are stored into the
 526   referenced CollectData instance."""
 527
 528   def __init__(self, pdc, cvs_file):
 529     """Create an object that is prepared to receive data for CVS_FILE.
 530     CVS_FILE is a CVSFile instance.  COLLECT_DATA is used to store the
 531     information collected about the file."""
 532
 533     self.pdc = pdc
 534     self.cvs_file = cvs_file
 535
 536     self.collect_data = self.pdc.collect_data
 537     self.project = self.cvs_file.project
 538
 539     # A place to store information about the symbols in this file:
 540     self.sdc = _SymbolDataCollector(self, self.cvs_file)
 541
 542     # { revision : _RevisionData instance }
 543     self._rev_data = { }
 544
 545     # Lists [ (parent, child) ] of revision number pairs indicating
 546     # that revision child depends on revision parent along the main
 547     # line of development.
 548     self._primary_dependencies = []
 549
 550     # If set, this is an RCS branch number -- rcsparse calls this the
 551     # "principal branch", but CVS and RCS refer to it as the "default
 552     # branch", so that's what we call it, even though the rcsparse API
 553     # setter method is still 'set_principal_branch'.
 554     self.default_branch = None
 555
 556     # True iff revision 1.1 of the file appears to have been imported
 557     # (as opposed to added normally).
 558     self._file_imported = False
 559
 560   def _get_rev_id(self, revision):
 561     if revision is None:
 562       return None
 563     return self._rev_data[revision].cvs_rev_id
 564
 565   def set_principal_branch(self, branch):
 566     """This is a callback method declared in Sink."""
 567
 568     if branch.find('.') == -1:
 569       # This just sets the default branch to trunk.  Normally this
 570       # shouldn't occur, but it has been seen in at least one CVS
 571       # repository.  Just ignore it.
 572       return
 573
 574     m = _branch_revision_re.match(branch)
 575     if not m:
 576       self.collect_data.record_fatal_error(
 577           'The default branch %s in file %r is not a valid branch number'
 578           % (branch, self.cvs_file.rcs_path,)
 579           )
 580       return
 581
 582     branch = m.group(1) + m.group(2)
 583     if branch.count('.') != 2:
 584       # We don't know how to deal with a non-top-level default
 585       # branch (what does CVS do?).  So if this case is detected,
 586       # punt:
 587       self.collect_data.record_fatal_error(
 588           'The default branch %s in file %r is not a top-level branch'
 589           % (branch, self.cvs_file.rcs_path,)
 590           )
 591       return
 592
 593     self.default_branch = branch
 594
 595   def define_tag(self, name, revision):
 596     """Remember the symbol name and revision, but don't process them yet.
 597
 598     This is a callback method declared in Sink."""
 599
 600     self.sdc.define_symbol(name, revision)
 601
 602   def set_expansion(self, mode):
 603     """This is a callback method declared in Sink."""
 604
 605     self.cvs_file.mode = mode
 606
 607   def admin_completed(self):
 608     """This is a callback method declared in Sink."""
 609
 610     self.sdc.process_symbols()
 611
 612   def define_revision(self, revision, timestamp, author, state,
 613                       branches, next):
 614     """This is a callback method declared in Sink."""
 615
 616     for branch in branches:
 617       try:
 618         branch_data = self.sdc.rev_to_branch_data(branch)
 619       except KeyError:
 620         # Normally we learn about the branches from the branch names
 621         # and numbers parsed from the symbolic name header.  But this
 622         # must have been an unlabeled branch that slipped through the
 623         # net.  Generate a name for it and create a _BranchData record
 624         # for it now.
 625         branch_data = self.sdc._add_unlabeled_branch(
 626             self.sdc.rev_to_branch_number(branch))
 627
 628       assert branch_data.child is None
 629       branch_data.child = branch
 630
 631     if revision in self._rev_data:
 632       # This revision has already been seen.
 633       logger.error('File %r contains duplicate definitions of revision %s.'
 634                   % (self.cvs_file.rcs_path, revision,))
 635       raise RuntimeError()
 636
 637     # Record basic information about the revision:
 638     rev_data = _RevisionData(
 639         self.collect_data.item_key_generator.gen_id(),
 640         revision, int(timestamp), author, state)
 641     self._rev_data[revision] = rev_data
 642
 643     # When on trunk, the RCS 'next' revision number points to what
 644     # humans might consider to be the 'previous' revision number.  For
 645     # example, 1.3's RCS 'next' is 1.2.
 646     #
 647     # However, on a branch, the RCS 'next' revision number really does
 648     # point to what humans would consider to be the 'next' revision
 649     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
 650     #
 651     # In other words, in RCS, 'next' always means "where to find the next
 652     # deltatext that you need this revision to retrieve.
 653     #
 654     # That said, we don't *want* RCS's behavior here, so we determine
 655     # whether we're on trunk or a branch and set the dependencies
 656     # accordingly.
 657     if next:
 658       if is_trunk_revision(revision):
 659         self._primary_dependencies.append( (next, revision,) )
 660       else:
 661         self._primary_dependencies.append( (revision, next,) )
 662
 663   def tree_completed(self):
 664     """The revision tree has been parsed.
 665
 666     Analyze it for consistency and connect some loose ends.
 667
 668     This is a callback method declared in Sink."""
 669
 670     self._resolve_primary_dependencies()
 671     self._resolve_branch_dependencies()
 672     self._sort_branches()
 673     self._resolve_tag_dependencies()
 674
 675     # Compute the preliminary CVSFileItems for this file:
 676     cvs_items = []
 677     cvs_items.extend(self._get_cvs_revisions())
 678     cvs_items.extend(self._get_cvs_branches())
 679     cvs_items.extend(self._get_cvs_tags())
 680     self._cvs_file_items = CVSFileItems(
 681         self.cvs_file, self.pdc.trunk, cvs_items
 682         )
 683
 684     self._cvs_file_items.check_link_consistency()
 685
 686   def _resolve_primary_dependencies(self):
 687     """Resolve the dependencies listed in self._primary_dependencies."""
 688
 689     for (parent, child,) in self._primary_dependencies:
 690       parent_data = self._rev_data[parent]
 691       assert parent_data.child is None
 692       parent_data.child = child
 693
 694       child_data = self._rev_data[child]
 695       assert child_data.parent is None
 696       child_data.parent = parent
 697
 698   def _resolve_branch_dependencies(self):
 699     """Resolve dependencies involving branches."""
 700
 701     for branch_data in self.sdc.branches_data.values():
 702       # The branch_data's parent has the branch as a child regardless
 703       # of whether the branch had any subsequent commits:
 704       try:
 705         parent_data = self._rev_data[branch_data.parent]
 706       except KeyError:
 707         logger.warn(
 708             'In %r:\n'
 709             '    branch %r references non-existing revision %s\n'
 710             '    and will be ignored.'
 711             % (self.cvs_file.rcs_path, branch_data.symbol.name,
 712                branch_data.parent,))
 713         del self.sdc.branches_data[branch_data.branch_number]
 714       else:
 715         parent_data.branches_data.append(branch_data)
 716
 717         # If the branch has a child (i.e., something was committed on
 718         # the branch), then we store a reference to the branch_data
 719         # there, define the child's parent to be the branch's parent,
 720         # and list the child in the branch parent's branches_revs_data:
 721         if branch_data.child is not None:
 722           child_data = self._rev_data[branch_data.child]
 723           assert child_data.parent_branch_data is None
 724           child_data.parent_branch_data = branch_data
 725           assert child_data.parent is None
 726           child_data.parent = branch_data.parent
 727           parent_data.branches_revs_data.append(branch_data.child)
 728
 729   def _sort_branches(self):
 730     """Sort the branches sprouting from each revision in creation order.
 731
 732     Creation order is taken to be the reverse of the order that they
 733     are listed in the symbols part of the RCS file.  (If a branch is
 734     created then deleted, a later branch can be assigned the recycled
 735     branch number; therefore branch numbers are not an indication of
 736     creation order.)"""
 737
 738     for rev_data in self._rev_data.values():
 739       rev_data.branches_data.sort(lambda a, b: - cmp(a.id, b.id))
 740
 741   def _resolve_tag_dependencies(self):
 742     """Resolve dependencies involving tags."""
 743
 744     for (rev, tag_data_list) in self.sdc.tags_data.items():
 745       try:
 746         parent_data = self._rev_data[rev]
 747       except KeyError:
 748         logger.warn(
 749             'In %r:\n'
 750             '    the following tag(s) reference non-existing revision %s\n'
 751             '    and will be ignored:\n'
 752             '    %s' % (
 753                 self.cvs_file.rcs_path, rev,
 754                 ', '.join([repr(tag_data.symbol.name)
 755                            for tag_data in tag_data_list]),))
 756         del self.sdc.tags_data[rev]
 757       else:
 758         for tag_data in tag_data_list:
 759           assert tag_data.rev == rev
 760           # The tag_data's rev has the tag as a child:
 761           parent_data.tags_data.append(tag_data)
 762
 763   def _get_cvs_branches(self):
 764     """Generate the CVSBranches present in this file."""
 765
 766     for branch_data in self.sdc.branches_data.values():
 767       yield CVSBranch(
 768           branch_data.id, self.cvs_file, branch_data.symbol,
 769           branch_data.branch_number,
 770           self.sdc.rev_to_lod(branch_data.parent),
 771           self._get_rev_id(branch_data.parent),
 772           self._get_rev_id(branch_data.child),
 773           None,
 774           )
 775
 776   def _get_cvs_tags(self):
 777     """Generate the CVSTags present in this file."""
 778
 779     for tags_data in self.sdc.tags_data.values():
 780       for tag_data in tags_data:
 781         yield CVSTag(
 782             tag_data.id, self.cvs_file, tag_data.symbol,
 783             self.sdc.rev_to_lod(tag_data.rev),
 784             self._get_rev_id(tag_data.rev),
 785             None,
 786             )
 787
 788   def set_description(self, description):
 789     """This is a callback method declared in Sink."""
 790
 791     self.cvs_file.description = description
 792     self.cvs_file.determine_file_properties(Ctx().file_property_setters)
 793
 794   def set_revision_info(self, revision, log, text):
 795     """This is a callback method declared in Sink."""
 796
 797     rev_data = self._rev_data[revision]
 798     cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]
 799
 800     if cvs_rev.metadata_id is not None:
 801       # Users have reported problems with repositories in which the
 802       # deltatext block for revision 1.1 appears twice.  It is not
 803       # known whether this results from a CVS/RCS bug, or from botched
 804       # hand-editing of the repository.  In any case, empirically, cvs
 805       # and rcs both use the first version when checking out data, so
 806       # that's what we will do.  (For the record: "cvs log" fails on
 807       # such a file; "rlog" prints the log message from the first
 808       # block and ignores the second one.)
 809       logger.warn(
 810           "%s: in '%s':\n"
 811           "   Deltatext block for revision %s appeared twice;\n"
 812           "   ignoring the second occurrence.\n"
 813           % (warning_prefix, self.cvs_file.rcs_path, revision,)
 814           )
 815       return
 816
 817     if is_trunk_revision(revision):
 818       branch_name = None
 819     else:
 820       branch_name = self.sdc.rev_to_branch_data(revision).symbol.name
 821
 822     cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
 823         self.project, branch_name, rev_data.author, log
 824         )
 825     cvs_rev.deltatext_exists = bool(text)
 826
 827     # If this is revision 1.1, determine whether the file appears to
 828     # have been created via 'cvs add' instead of 'cvs import'.  The
 829     # test is that the log message CVS uses for 1.1 in imports is
 830     # "Initial revision\n" with no period.  (This fact helps determine
 831     # whether this file might have had a default branch in the past.)
 832     if revision == '1.1':
 833       self._file_imported = (log == 'Initial revision\n')
 834
 835   def parse_completed(self):
 836     """Finish the processing of this file.
 837
 838     This is a callback method declared in Sink."""
 839
 840     # Make sure that there was an info section for each revision:
 841     for cvs_item in self._cvs_file_items.values():
 842       if isinstance(cvs_item, CVSRevision) and cvs_item.metadata_id is None:
 843         self.collect_data.record_fatal_error(
 844             '%r has no deltatext section for revision %s'
 845             % (self.cvs_file.rcs_path, cvs_item.rev,)
 846             )
 847
 848   def _determine_operation(self, rev_data):
 849     prev_rev_data = self._rev_data.get(rev_data.parent)
 850     return cvs_revision_type_map[(
 851         rev_data.state != 'dead',
 852         prev_rev_data is not None and prev_rev_data.state != 'dead',
 853         )]
 854
 855   def _get_cvs_revisions(self):
 856     """Generate the CVSRevisions present in this file."""
 857
 858     for rev_data in self._rev_data.itervalues():
 859       yield self._get_cvs_revision(rev_data)
 860
 861   def _get_cvs_revision(self, rev_data):
 862     """Create and return a CVSRevision for REV_DATA."""
 863
 864     branch_ids = [
 865         branch_data.id
 866         for branch_data in rev_data.branches_data
 867         ]
 868
 869     branch_commit_ids = [
 870         self._get_rev_id(rev)
 871         for rev in rev_data.branches_revs_data
 872         ]
 873
 874     tag_ids = [
 875         tag_data.id
 876         for tag_data in rev_data.tags_data
 877         ]
 878
 879     revision_type = self._determine_operation(rev_data)
 880
 881     return revision_type(
 882         self._get_rev_id(rev_data.rev), self.cvs_file,
 883         rev_data.timestamp, None,
 884         self._get_rev_id(rev_data.parent),
 885         self._get_rev_id(rev_data.child),
 886         rev_data.rev,
 887         True,
 888         self.sdc.rev_to_lod(rev_data.rev),
 889         rev_data.get_first_on_branch_id(),
 890         False, None, None,
 891         tag_ids, branch_ids, branch_commit_ids,
 892         rev_data.revision_reader_token
 893         )
 894
 895   def get_cvs_file_items(self):
 896     """Finish up and return a CVSFileItems instance for this file.
 897
 898     This method must only be called once."""
 899
 900     self._process_ntdbrs()
 901
 902     # Break a circular reference loop, allowing the memory for self
 903     # and sdc to be freed.
 904     del self.sdc
 905
 906     return self._cvs_file_items
 907
 908   def _process_ntdbrs(self):
 909     """Fix up any non-trunk default branch revisions (if present).
 910
 911     If a non-trunk default branch is determined to have existed, yield
 912     the _RevisionData.ids for all revisions that were once non-trunk
 913     default revisions, in dependency order.
 914
 915     There are two cases to handle:
 916
 917     One case is simple.  The RCS file lists a default branch
 918     explicitly in its header, such as '1.1.1'.  In this case, we know
 919     that every revision on the vendor branch is to be treated as head
 920     of trunk at that point in time.
 921
 922     But there's also a degenerate case.  The RCS file does not
 923     currently have a default branch, yet we can deduce that for some
 924     period in the past it probably *did* have one.  For example, the
 925     file has vendor revisions 1.1.1.1 -> 1.1.1.96, all of which are
 926     dated before 1.2, and then it has 1.1.1.97 -> 1.1.1.100 dated
 927     after 1.2.  In this case, we should record 1.1.1.96 as the last
 928     vendor revision to have been the head of the default branch.
 929
 930     If any non-trunk default branch revisions are found:
 931
 932     - Set their ntdbr members to True.
 933
 934     - Connect the last one with revision 1.2.
 935
 936     - Remove revision 1.1 if it is not needed.
 937
 938     """
 939
 940     try:
 941       if self.default_branch:
 942         try:
 943           vendor_cvs_branch_id = self.sdc.branches_data[self.default_branch].id
 944         except KeyError:
 945           logger.warn(
 946               '%s: In %s:\n'
 947               '    vendor branch %r is not present in file and will be ignored.'
 948               % (warning_prefix, self.cvs_file.rcs_path, self.default_branch,)
 949               )
 950           self.default_branch = None
 951           return
 952         vendor_lod_items = self._cvs_file_items.get_lod_items(
 953             self._cvs_file_items[vendor_cvs_branch_id]
 954             )
 955         if not self._cvs_file_items.process_live_ntdb(vendor_lod_items):
 956           return
 957       elif self._file_imported:
 958         vendor_branch_data = self.sdc.branches_data.get('1.1.1')
 959         if vendor_branch_data is None:
 960           return
 961         else:
 962           vendor_lod_items = self._cvs_file_items.get_lod_items(
 963               self._cvs_file_items[vendor_branch_data.id]
 964               )
 965           if not self._cvs_file_items.process_historical_ntdb(
 966                 vendor_lod_items
 967                 ):
 968             return
 969       else:
 970         return
 971     except VendorBranchError, e:
 972       self.collect_data.record_fatal_error(str(e))
 973       return
 974
 975     if self._file_imported:
 976       self._cvs_file_items.imported_remove_1_1(vendor_lod_items)
 977
 978     self._cvs_file_items.check_link_consistency()
 979
 980
 981 class _ProjectDataCollector:
 982   def __init__(self, collect_data, project):
 983     self.collect_data = collect_data
 984     self.project = project
 985     self.num_files = 0
 986
 987     # The Trunk LineOfDevelopment object for this project:
 988     self.trunk = Trunk(
 989         self.collect_data.symbol_key_generator.gen_id(), self.project
 990         )
 991     self.project.trunk_id = self.trunk.id
 992
 993     # This causes a record for self.trunk to spring into existence:
 994     self.collect_data.register_trunk(self.trunk)
 995
 996     # A map { name -> Symbol } for all known symbols in this project.
 997     # The symbols listed here are undifferentiated into Branches and
 998     # Tags because the same name might appear as a branch in one file
 999     # and a tag in another.
1000     self.symbols = {}
1001
1002     # A map { (old_name, new_name) : count } indicating how many files
1003     # were affected by each each symbol name transformation:
1004     self.symbol_transform_counts = {}
1005
1006   def get_symbol(self, name):
1007     """Return the Symbol object for the symbol named NAME in this project.
1008
1009     If such a symbol does not yet exist, allocate a new symbol_id,
1010     create a Symbol instance, store it in self.symbols, and return it."""
1011
1012     symbol = self.symbols.get(name)
1013     if symbol is None:
1014       symbol = Symbol(
1015           self.collect_data.symbol_key_generator.gen_id(),
1016           self.project, name)
1017       self.symbols[name] = symbol
1018     return symbol
1019
1020   def log_symbol_transform(self, old_name, new_name):
1021     """Record that OLD_NAME was transformed to NEW_NAME in one file.
1022
1023     This information is used to generated a statistical summary of
1024     symbol transforms."""
1025
1026     try:
1027       self.symbol_transform_counts[old_name, new_name] += 1
1028     except KeyError:
1029       self.symbol_transform_counts[old_name, new_name] = 1
1030
1031   def summarize_symbol_transforms(self):
1032     if self.symbol_transform_counts and logger.is_on(logger.NORMAL):
1033       logger.normal('Summary of symbol transforms:')
1034       transforms = self.symbol_transform_counts.items()
1035       transforms.sort()
1036       for ((old_name, new_name), count) in transforms:
1037         if new_name is None:
1038           logger.normal('    "%s" ignored in %d files' % (old_name, count,))
1039         else:
1040           logger.normal(
1041               '    "%s" transformed to "%s" in %d files'
1042               % (old_name, new_name, count,)
1043               )
1044
1045   def process_file(self, cvs_file):
1046     logger.normal(cvs_file.rcs_path)
1047     fdc = _FileDataCollector(self, cvs_file)
1048     try:
1049       parse(open(cvs_file.rcs_path, 'rb'), fdc)
1050     except (RCSParseError, RuntimeError):
1051       self.collect_data.record_fatal_error(
1052           "%r is not a valid ,v file" % (cvs_file.rcs_path,)
1053           )
1054       # Abort the processing of this file, but let the pass continue
1055       # with other files:
1056       return
1057     except ValueError, e:
1058       self.collect_data.record_fatal_error(
1059           "%r is not a valid ,v file (%s)" % (cvs_file.rcs_path, str(e),)
1060           )
1061       # Abort the processing of this file, but let the pass continue
1062       # with other files:
1063       return
1064     except:
1065       logger.warn("Exception occurred while parsing %s" % cvs_file.rcs_path)
1066       raise
1067     else:
1068       self.num_files += 1
1069
1070     return fdc.get_cvs_file_items()
1071
1072
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, stats_keeper):
    """Create the stores and key generators used during collection.

    STATS_KEEPER is the object to which every recorded CVSFile and
    CVSItem is reported (see add_cvs_file_items())."""

    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # The fatal errors recorded so far (see record_fatal_error()):
    self.fatal_errors = []

    # The total number of files processed, summed over all projects:
    self.num_files = 0

    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    logger.error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # This causes a record to spring into existence:
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile.

    Unneeded initial deletes are removed, non-trunk items are
    discarded if this is a --trunk-only conversion, the links between
    the remaining items are checked for consistency, and the result is
    recorded (see add_cvs_file_items()) and registered with
    self.symbol_stats."""

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def _process_cvs_file(self, pdc, cvs_file):
    """Parse CVS_FILE using PDC and process the resulting CVSFileItems.

    PDC.process_file() returns None if the file could not be parsed.
    In that case a fatal error has already been recorded and there is
    nothing to process, so the file is skipped and the pass can
    continue with other files."""

    cvs_file_items = pdc.process_file(cvs_file)
    if cvs_file_items is not None:
      self._process_cvs_file_items(cvs_file_items)

  def process_project(self, project, cvs_paths):
    """Collect the data for PROJECT from the paths in CVS_PATHS.

    Each CVSDirectory in CVS_PATHS is recorded; every other path is
    treated as a CVSFile, which is parsed and processed.  Record a
    fatal error if CVS_PATHS contains no files at all."""

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in cvs_paths:
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
      else:
        # The file counts as found even if it turns out to be
        # unparsable; that problem is reported as a separate fatal
        # error.
        found_rcs_file = True
        self._process_cvs_file(pdc, cvs_path)

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    logger.verbose('Processed', self.num_files, 'files')

  def _register_empty_subdirectories(self):
    """Fill in the CVSDirectory.empty_subdirectory_ids members.

    A directory is treated as empty if no CVSFile is located in it or
    anywhere below it.  The id of each such directory is appended to
    the empty_subdirectory_ids list of its parent directory (if it has
    one)."""

    directories = set(
        path
        for path in Ctx()._cvs_path_db.itervalues()
        if isinstance(path, CVSDirectory)
        )
    for path in Ctx()._cvs_path_db.itervalues():
      if isinstance(path, CVSFile):
        # All of the ancestors of a file are non-empty.  Stop climbing
        # as soon as a directory has already been removed, because its
        # ancestors were removed at the same time.
        directory = path.parent_directory
        while directory is not None and directory in directories:
          directories.remove(directory)
          directory = directory.parent_directory
    for directory in directories:
      if directory.parent_directory is not None:
        directory.parent_directory.empty_subdirectory_ids.append(directory.id)

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._register_empty_subdirectories()
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval
1217
1218