Update comment (there is no longer a class called RevisionManager).
[cvs2svn.git] / cvs2svn_lib / collect_data.py
blobf8ade599b3382ed2e6e103f5ffcc6122b7def907
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """Data collection classes.
19 This module contains the code used to collect data from the CVS
20 repository. It parses *,v files, recording all useful information
21 except for the actual file contents.
23 As a *,v file is parsed, the information pertaining to the file is
24 accumulated in memory, mostly in _RevisionData, _BranchData, and
25 _TagData objects. When parsing is complete, a final pass is made over
26 the data to create some final dependency links, collect statistics,
27 etc., then the _*Data objects are converted into CVSItem objects
28 (CVSRevision, CVSBranch, and CVSTag respectively) and the CVSItems are
29 dumped into databases.
31 During the data collection, persistent unique ids are allocated to
32 many types of objects: CVSFile, Symbol, and CVSItems. CVSItems are a
33 special case. CVSItem ids are unique across all CVSItem types, and
34 the ids are carried over from the corresponding data collection
35 objects:
37 _RevisionData -> CVSRevision
39 _BranchData -> CVSBranch
41 _TagData -> CVSTag
43 In a later pass it is possible to convert tags <-> branches. But even
44 if this occurs, the new branch or tag uses the same id as the old tag
45 or branch.
47 """
50 import os
51 import stat
52 import re
54 from cvs2svn_lib import config
55 from cvs2svn_lib.common import DB_OPEN_NEW
56 from cvs2svn_lib.common import warning_prefix
57 from cvs2svn_lib.common import error_prefix
58 from cvs2svn_lib.common import is_trunk_revision
59 from cvs2svn_lib.common import is_branch_revision_number
60 from cvs2svn_lib.log import Log
61 from cvs2svn_lib.context import Ctx
62 from cvs2svn_lib.artifact_manager import artifact_manager
63 from cvs2svn_lib.cvs_path import CVSPath
64 from cvs2svn_lib.cvs_path import CVSFile
65 from cvs2svn_lib.cvs_path import CVSDirectory
66 from cvs2svn_lib.symbol import Symbol
67 from cvs2svn_lib.symbol import Trunk
68 from cvs2svn_lib.cvs_item import CVSRevision
69 from cvs2svn_lib.cvs_item import CVSBranch
70 from cvs2svn_lib.cvs_item import CVSTag
71 from cvs2svn_lib.cvs_item import cvs_revision_type_map
72 from cvs2svn_lib.cvs_file_items import VendorBranchError
73 from cvs2svn_lib.cvs_file_items import CVSFileItems
74 from cvs2svn_lib.key_generator import KeyGenerator
75 from cvs2svn_lib.cvs_item_database import NewCVSItemStore
76 from cvs2svn_lib.symbol_statistics import SymbolStatisticsCollector
77 from cvs2svn_lib.metadata_database import MetadataDatabase
78 from cvs2svn_lib.metadata_database import MetadataLogger
79 from cvs2svn_lib.repository_walker import walk_repository
81 import cvs2svn_rcsparse
# A regular expression defining "valid" revision numbers (used to
# check that symbol definitions are reasonable).  The pattern is
# anchored at both ends so that the *whole* string has to be a dotted
# sequence of at least two digit groups.
_valid_revision_re = re.compile(r'''
    ^
    (?:\d+\.)+          # Digit groups with trailing dots
    \d+                 # And the last digit group.
    $
    ''', re.VERBOSE)

# A regular expression matching a complete branch number, either in
# the RCS form (e.g. '1.7.2') or in the CVS form with an extra zero
# component (e.g. '1.7.0.2').  Group 1 is the leading part including
# its trailing dot, group 2 is the final digit group, so substituting
# r'\1\2' drops the optional zero component.
_branch_revision_re = re.compile(r'''
    ^
    ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
    (?:0\.)?            # CVS sticks an extra 0 here; RCS does not
    (\d+)               # And the last digit group
    $
    ''', re.VERBOSE)
def is_same_line_of_development(rev1, rev2):
    """Tell whether REV1 and REV2 belong to one line of development.

    Two revisions share a line of development when both are trunk
    revisions or when both lie on the same branch.  If either argument
    is None the answer is always False."""

    if rev1 is None or rev2 is None:
        return False

    # Trunk needs its own test: the leading trunk number can be bumped
    # (e.g., from 1 to 2), so comparing prefixes would not be enough.
    both_on_trunk = is_trunk_revision(rev1) and is_trunk_revision(rev2)
    if both_on_trunk:
        return True

    # Otherwise compare everything up to (not including) the last dot.
    prefix1 = rev1[:rev1.rfind('.')]
    prefix2 = rev2[:rev2.rfind('.')]
    return prefix1 == prefix2
class _RevisionData:
    """Everything recorded about one revision while its file is parsed.

    The state of each revision is kept so that, once the
    _RevisionData objects for a whole file are available, the state of
    a revision's predecessor can be examined.  Without the
    predecessor's state an add could not be told apart from a
    change."""

    def __init__(self, cvs_rev_id, rev, timestamp, author, state):
        # The id of this revision, followed by the basic facts handed
        # in by the caller:
        self.cvs_rev_id = cvs_rev_id
        self.rev = rev
        self.timestamp = timestamp
        self.author = author
        self.state = state

        # The branch_data of the branch on which this revision is the
        # first commit; None for every other revision.
        self.parent_branch_data = None

        # Revision number of this revision's parent on the same line of
        # development, if there is one.  The first revision on a branch
        # treats the revision from which the branch sprouted as its
        # parent.  None for the root of the file's revision tree.
        #
        # This cannot be worked out arithmetically from the revision
        # number (because of cvsadmin -o), hence the explicit field.
        self.parent = None

        # Revision number of the primary child (the child on the same
        # line of development), or None if there is none.
        self.child = None

        # _BranchData instances for the branches sprouting from this
        # revision.  Filling this in here would mean scanning every
        # branch known to the _SymbolDataCollector for those that have
        # this revision as parent, so it is instead populated by
        # _FileDataCollector._resolve_branch_dependencies() and put in
        # order by _FileDataCollector._sort_branches().
        self.branches_data = []

        # Revision numbers of the first commits on those branches that
        # actually received commits.  Recorded explicitly because a
        # topological sort that only looks at revisions would not see
        # the dependency that runs through branches_data.
        self.branches_revs_data = []

        # _TagData instances of the tags attached to this revision.
        self.tags_data = []

        # A token that a RevisionCollector may set and a RevisionReader
        # may later use to obtain the text again.
        self.revision_reader_token = None

    def get_first_on_branch_id(self):
        """Return the id of the branch this revision starts, if any.

        When parent_branch_data is unset (falsy) it is returned as is;
        otherwise its id attribute is returned."""

        branch_data = self.parent_branch_data
        if not branch_data:
            return branch_data
        return branch_data.id
class _SymbolData:
    """Holding area for what is known about a symbol in one CVSFile.

    The symbol attribute is a Symbol instance that is not yet
    differentiated into a Branch or a Tag, whether self happens to be
    a _BranchData or a _TagData."""

    def __init__(self, id, symbol):
        """Set up the record for SYMBOL, identified by ID."""

        # An instance of Symbol.
        self.symbol = symbol

        # Unique id for this symbol within this file; the CVSItem
        # derived from this record reuses the same id.
        self.id = id
class _BranchData(_SymbolData):
    """Holding area for what is known about a Branch in one CVSFile."""

    def __init__(self, id, symbol, branch_number):
        _SymbolData.__init__(self, id, symbol)

        # The number of this branch (e.g., '1.5.2').
        self.branch_number = branch_number

        # Revision number from which the branch sprouts (e.g., '1.5'):
        # the branch number with its last component removed.
        last_dot = self.branch_number.rindex(".")
        self.parent = self.branch_number[:last_dot]

        # Revision number of the first commit on the branch (e.g.,
        # '1.5.2.1'), or None while no such commit is known.
        self.child = None
class _TagData(_SymbolData):
    """Holding area for what is known about a Tag in one CVSFile."""

    def __init__(self, id, symbol, rev):
        # Number of the tagged revision (e.g., '1.5.2.3').
        self.rev = rev

        _SymbolData.__init__(self, id, symbol)
class _SymbolDataCollector(object):
    """Collect information about symbols in a single CVSFile."""

    def __init__(self, fdc, cvs_file):
        """Prepare to collect the symbols of CVS_FILE on behalf of FDC.

        The pdc and collect_data attributes of FDC are copied onto
        this object for convenient access."""

        self.fdc = fdc
        self.cvs_file = cvs_file

        self.pdc = self.fdc.pdc
        self.collect_data = self.fdc.collect_data

        # A list [(name, revision), ...] of symbols defined in the
        # header of the file.  The name has already been transformed
        # using the symbol transform rules.  If the symbol transform
        # rules indicate that the symbol should be ignored, then it is
        # never added to this list.  This list is processed then
        # deleted in process_symbols().
        self._symbol_defs = []

        # A set containing the transformed names of symbols in this
        # file (used to detect duplicates during processing of
        # unlabeled branches):
        self._defined_symbols = set()

        # Map { branch_number : _BranchData }, where branch_number has
        # an odd number of digits.
        self.branches_data = {}

        # Map { revision : [ tag_data ] }, where revision has an even
        # number of digits, and the value is a list of _TagData objects
        # for tags that apply to that revision.
        self.tags_data = {}

    def _add_branch(self, name, branch_number):
        """Record that BRANCH_NUMBER is the branch number for branch NAME,
        and derive and record the revision from which NAME sprouts.
        BRANCH_NUMBER is an RCS branch number with an odd number of
        components, for example '1.7.2' (never '1.7.0.2').  Return the
        _BranchData instance (which is usually newly-created).

        If BRANCH_NUMBER already has a name, a warning is logged, NAME
        is ignored and the existing _BranchData is returned."""

        branch_data = self.branches_data.get(branch_number)

        if branch_data is not None:
            Log().warn(
                "%s: in '%s':\n"
                " branch '%s' already has name '%s',\n"
                " cannot also have name '%s', ignoring the latter\n"
                % (warning_prefix,
                   self.cvs_file.filename, branch_number,
                   branch_data.symbol.name, name)
                )
            return branch_data

        symbol = self.pdc.get_symbol(name)
        branch_data = _BranchData(
            self.collect_data.item_key_generator.gen_id(), symbol,
            branch_number
            )
        self.branches_data[branch_number] = branch_data
        return branch_data

    def _construct_distinct_name(self, name, original_name):
        """Construct a distinct symbol name from NAME.

        If NAME is distinct, return it.  If it is already used in this
        file (as determined from its presence in
        self._defined_symbols), record a fatal error and return a new
        name of the form 'NAME-DUPLICATE-<n>' that is not already
        used.  ORIGINAL_NAME is only used in the error message."""

        if name not in self._defined_symbols:
            return name

        index = 1
        while True:
            dup_name = '%s-DUPLICATE-%d' % (name, index,)
            if dup_name not in self._defined_symbols:
                self.collect_data.record_fatal_error(
                    "Symbol name '%s' is already used in '%s'.\n"
                    "The unlabeled branch '%s' must be renamed using "
                    "--symbol-transform."
                    % (name, self.cvs_file.filename, original_name,)
                    )
                return dup_name
            # That candidate is taken as well; try the next suffix.
            # (Without this step the loop would never terminate.)
            index += 1

    def _add_unlabeled_branch(self, branch_number):
        """Create and return a _BranchData for an unnamed branch.

        A name of the form 'unlabeled-BRANCH_NUMBER' is generated and
        passed through the symbol transforms.  A transform that would
        make the branch be ignored is recorded as a fatal error, and
        the generated name is kept so that processing can go on."""

        original_name = "unlabeled-" + branch_number
        name = self.transform_symbol(original_name, branch_number)
        if name is None:
            self.collect_data.record_fatal_error(
                "The unlabeled branch '%s' in '%s' contains commits.\n"
                "It may not be ignored via a symbol transform. (Use --exclude "
                "instead.)"
                % (original_name, self.cvs_file.filename,)
                )
            # Retain the original name to allow the conversion to
            # continue:
            name = original_name

        distinct_name = self._construct_distinct_name(name, original_name)
        self._defined_symbols.add(distinct_name)
        return self._add_branch(distinct_name, branch_number)

    def _add_tag(self, name, revision):
        """Record that tag NAME refers to the specified REVISION.

        Return the newly created _TagData."""

        symbol = self.pdc.get_symbol(name)
        tag_data = _TagData(
            self.collect_data.item_key_generator.gen_id(), symbol, revision
            )
        self.tags_data.setdefault(revision, []).append(tag_data)
        return tag_data

    def transform_symbol(self, name, revision):
        """Transform a symbol according to the project's symbol transforms.

        Transform the symbol with the original name NAME and
        canonicalized revision number REVISION.  Return the new symbol
        name or None if the symbol should be ignored entirely.

        Log the results of the symbol transform if necessary."""

        old_name = name
        # Apply any user-defined symbol transforms to the symbol name:
        name = self.cvs_file.project.transform_symbol(
            self.cvs_file, name, revision
            )

        if name is None:
            # Ignore symbol:
            self.pdc.log_symbol_transform(old_name, None)
            Log().verbose(
                " symbol '%s'=%s ignored in %s"
                % (old_name, revision, self.cvs_file.filename,)
                )
        else:
            if name != old_name:
                self.pdc.log_symbol_transform(old_name, name)
                Log().verbose(
                    " symbol '%s'=%s transformed to '%s' in %s"
                    % (old_name, revision, name, self.cvs_file.filename,)
                    )

        return name

    def define_symbol(self, name, revision):
        """Record a symbol definition for later processing.

        Symbols that the transforms discard, and symbols whose
        revision number is not valid, are not recorded (the latter
        with a warning)."""

        # Canonicalize the revision number:
        revision = _branch_revision_re.sub(r'\1\2', revision)

        # Apply any user-defined symbol transforms to the symbol name:
        name = self.transform_symbol(name, revision)

        if name is not None:
            # Verify that the revision number is valid:
            if _valid_revision_re.match(revision):
                # The revision number is valid; record it for later
                # processing:
                self._symbol_defs.append((name, revision))
            else:
                Log().warn(
                    'In %r:\n'
                    ' branch %r references invalid revision %s\n'
                    ' and will be ignored.'
                    % (self.cvs_file.filename, name, revision,)
                    )

    def _eliminate_trivial_duplicate_defs(self, symbol_defs):
        """Iterate through SYMBOL_DEFS, removing identical duplicate
        definitions.

        Duplicate definitions of symbol names have been seen in the
        wild, and they can also happen when --symbol-transform is
        used.  If a symbol is defined to the same revision number
        repeatedly, then ignore all but the last definition.

        This is a generator yielding the surviving (name, revision)
        pairs in their original order."""

        # Make a copy, since we have to iterate through the
        # definitions twice:
        symbol_defs = list(symbol_defs)

        # A map { (name, revision) : [index,...] } of the indexes where
        # symbol definitions name=revision were found:
        known_definitions = {}
        for (i, symbol_def) in enumerate(symbol_defs):
            known_definitions.setdefault(symbol_def, []).append(i)

        # A set of the indexes of entries that have to be removed from
        # symbol_defs:
        dup_indexes = set()
        for ((name, revision), indexes) in known_definitions.items():
            if len(indexes) > 1:
                Log().verbose(
                    "in %r:\n"
                    " symbol %s:%s defined multiple times; ignoring duplicates\n"
                    % (self.cvs_file.filename, name, revision,)
                    )
                dup_indexes.update(indexes[:-1])

        for (i, symbol_def) in enumerate(symbol_defs):
            if i not in dup_indexes:
                yield symbol_def

    def _process_duplicate_defs(self, symbol_defs):
        """Iterate through SYMBOL_DEFS, processing duplicate names.

        Duplicate definitions of symbol names have been seen in the
        wild, and they can also happen when --symbol-transform is
        used.  If a symbol is defined multiple times, then it is a
        fatal error.  This method should be called after
        _eliminate_trivial_duplicate_defs().

        This is a generator yielding, for every name, only its last
        definition, in original order."""

        # Make a copy, since we have to access multiple times:
        symbol_defs = list(symbol_defs)

        # A map {name : [index,...]} mapping the names of symbols to a
        # list of their definitions' indexes in symbol_defs:
        known_symbols = {}
        for (i, (name, revision)) in enumerate(symbol_defs):
            known_symbols.setdefault(name, []).append(i)

        # Visit the names in sorted order so that errors are reported
        # in a reproducible order.
        dup_indexes = set()
        for (name, indexes) in sorted(known_symbols.items()):
            if len(indexes) > 1:
                # This symbol was defined multiple times.
                self.collect_data.record_fatal_error(
                    "Multiple definitions of the symbol '%s' in '%s': %s" % (
                        name, self.cvs_file.filename,
                        ' '.join([symbol_defs[i][1] for i in indexes]),
                        )
                    )
                # Ignore all but the last definition for now, to allow
                # the conversion to proceed:
                dup_indexes.update(indexes[:-1])

        for (i, symbol_def) in enumerate(symbol_defs):
            if i not in dup_indexes:
                yield symbol_def

    def _process_symbol(self, name, revision):
        """Process a symbol called NAME, which is associated with REVISION.

        REVISION is a canonical revision number with zeros removed,
        for example: '1.7', '1.7.2', or '1.1.1' or '1.1.1.1'.  NAME is
        a transformed branch or tag name."""

        # Add symbol to our records:
        if is_branch_revision_number(revision):
            self._add_branch(name, revision)
        else:
            self._add_tag(name, revision)

    def process_symbols(self):
        """Process the symbol definitions from SELF._symbol_defs.

        SELF._symbol_defs is deleted in the process, so this can only
        be done once."""

        symbol_defs = self._symbol_defs
        del self._symbol_defs

        symbol_defs = self._eliminate_trivial_duplicate_defs(symbol_defs)
        symbol_defs = self._process_duplicate_defs(symbol_defs)

        for (name, revision) in symbol_defs:
            self._defined_symbols.add(name)
            self._process_symbol(name, revision)

    @staticmethod
    def rev_to_branch_number(revision):
        """Return the branch_number of the branch on which REVISION lies.

        REVISION is a branch revision number with an even number of
        components; for example '1.7.2.1' (never '1.7.2' nor
        '1.7.0.2').  The return value is the branch number (for
        example, '1.7.2').  Return None iff REVISION is a trunk
        revision such as '1.2'."""

        if is_trunk_revision(revision):
            return None
        return revision[:revision.rindex(".")]

    def rev_to_branch_data(self, revision):
        """Return the branch_data of the branch on which REVISION lies.

        REVISION must be a branch revision number with an even number
        of components; for example '1.7.2.1' (never '1.7.2' nor
        '1.7.0.2').  Raise KeyError iff REVISION is unknown."""

        assert not is_trunk_revision(revision)

        return self.branches_data[self.rev_to_branch_number(revision)]

    def rev_to_lod(self, revision):
        """Return the line of development on which REVISION lies.

        REVISION must be a revision number with an even number of
        components.  Raise KeyError iff REVISION is unknown."""

        if is_trunk_revision(revision):
            return self.pdc.trunk
        else:
            return self.rev_to_branch_data(revision).symbol
class _FileDataCollector(cvs2svn_rcsparse.Sink):
    """Class responsible for collecting RCS data for a particular file.

    Any collected data that need to be remembered are stored into the
    referenced CollectData instance."""

    def __init__(self, pdc, cvs_file):
        """Create an object that is prepared to receive data for CVS_FILE.

        CVS_FILE is a CVSFile instance.  PDC is the object whose
        collect_data attribute is used to store the information
        collected about the file."""

        self.pdc = pdc
        self.cvs_file = cvs_file

        self.collect_data = self.pdc.collect_data
        self.project = self.cvs_file.project

        # A place to store information about the symbols in this file:
        self.sdc = _SymbolDataCollector(self, self.cvs_file)

        # { revision : _RevisionData instance }
        self._rev_data = {}

        # Lists [ (parent, child) ] of revision number pairs indicating
        # that revision child depends on revision parent along the
        # main line of development.
        self._primary_dependencies = []

        # If set, this is an RCS branch number -- rcsparse calls this
        # the "principal branch", but CVS and RCS refer to it as the
        # "default branch", so that's what we call it, even though the
        # rcsparse API setter method is still 'set_principal_branch'.
        self.default_branch = None

        # True iff revision 1.1 of the file appears to have been
        # imported (as opposed to added normally).
        self._file_imported = False

    def _get_rev_id(self, revision):
        """Return the cvs_rev_id recorded for REVISION (None for None).

        Raise KeyError if REVISION is not None and is unknown."""

        if revision is None:
            return None
        return self._rev_data[revision].cvs_rev_id

    def set_principal_branch(self, branch):
        """This is a callback method declared in Sink."""

        if '.' in branch:
            self.default_branch = branch
        # Otherwise this just sets the default branch to trunk.
        # Normally this shouldn't occur, but it has been seen in at
        # least one CVS repository.  Just ignore it.

    def define_tag(self, name, revision):
        """Remember the symbol name and revision, but don't process them yet.

        This is a callback method declared in Sink."""

        self.sdc.define_symbol(name, revision)

    def set_expansion(self, mode):
        """This is a callback method declared in Sink."""

        self.cvs_file.mode = mode

    def admin_completed(self):
        """This is a callback method declared in Sink."""

        self.sdc.process_symbols()

    def define_revision(self, revision, timestamp, author, state,
                        branches, next):
        """This is a callback method declared in Sink.

        Raise RuntimeError if REVISION was already defined."""

        for branch in branches:
            try:
                branch_data = self.sdc.rev_to_branch_data(branch)
            except KeyError:
                # Normally we learn about the branches from the branch
                # names and numbers parsed from the symbolic name
                # header.  But this must have been an unlabeled branch
                # that slipped through the net.  Generate a name for it
                # and create a _BranchData record for it now.
                branch_data = self.sdc._add_unlabeled_branch(
                    self.sdc.rev_to_branch_number(branch))

            assert branch_data.child is None
            branch_data.child = branch

        if revision in self._rev_data:
            # This revision has already been seen.
            message = (
                'File %r contains duplicate definitions of revision %s.'
                % (self.cvs_file.filename, revision,)
                )
            Log().error(message)
            # Carry the message in the exception too, so that it is
            # not lost if the exception is reported elsewhere.
            raise RuntimeError(message)

        # Record basic information about the revision:
        rev_data = _RevisionData(
            self.collect_data.item_key_generator.gen_id(),
            revision, int(timestamp), author, state)
        self._rev_data[revision] = rev_data

        # When on trunk, the RCS 'next' revision number points to what
        # humans might consider to be the 'previous' revision number.
        # For example, 1.3's RCS 'next' is 1.2.
        #
        # However, on a branch, the RCS 'next' revision number really
        # does point to what humans would consider to be the 'next'
        # revision number.  For example, 1.1.2.1's RCS 'next' would be
        # 1.1.2.2.
        #
        # In other words, in RCS, 'next' always means "where to find
        # the next deltatext that you need this revision to retrieve".
        #
        # That said, we don't *want* RCS's behavior here, so we
        # determine whether we're on trunk or a branch and set the
        # dependencies accordingly.
        if next:
            if is_trunk_revision(revision):
                self._primary_dependencies.append((next, revision,))
            else:
                self._primary_dependencies.append((revision, next,))

    def tree_completed(self):
        """The revision tree has been parsed.

        Analyze it for consistency and connect some loose ends.

        This is a callback method declared in Sink."""

        self._resolve_primary_dependencies()
        self._resolve_branch_dependencies()
        self._sort_branches()
        self._resolve_tag_dependencies()

        # Compute the preliminary CVSFileItems for this file:
        cvs_items = []
        cvs_items.extend(self._get_cvs_revisions())
        cvs_items.extend(self._get_cvs_branches())
        cvs_items.extend(self._get_cvs_tags())
        self._cvs_file_items = CVSFileItems(
            self.cvs_file, self.pdc.trunk, cvs_items
            )

        self._cvs_file_items.check_link_consistency()

    def _resolve_primary_dependencies(self):
        """Resolve the dependencies listed in self._primary_dependencies."""

        for (parent, child,) in self._primary_dependencies:
            parent_data = self._rev_data[parent]
            assert parent_data.child is None
            parent_data.child = child

            child_data = self._rev_data[child]
            assert child_data.parent is None
            child_data.parent = parent

    def _resolve_branch_dependencies(self):
        """Resolve dependencies involving branches.

        Branches whose parent revision does not exist are dropped from
        self.sdc.branches_data with a warning."""

        # Iterate over a snapshot, because entries may be deleted from
        # the dictionary inside the loop.
        for branch_data in list(self.sdc.branches_data.values()):
            # The branch_data's parent has the branch as a child
            # regardless of whether the branch had any subsequent
            # commits:
            try:
                parent_data = self._rev_data[branch_data.parent]
            except KeyError:
                Log().warn(
                    'In %r:\n'
                    ' branch %r references non-existing revision %s\n'
                    ' and will be ignored.'
                    % (self.cvs_file.filename, branch_data.symbol.name,
                       branch_data.parent,))
                del self.sdc.branches_data[branch_data.branch_number]
            else:
                parent_data.branches_data.append(branch_data)

                # If the branch has a child (i.e., something was
                # committed on the branch), then we store a reference
                # to the branch_data there, define the child's parent
                # to be the branch's parent, and list the child in the
                # branch parent's branches_revs_data:
                if branch_data.child is not None:
                    child_data = self._rev_data[branch_data.child]
                    assert child_data.parent_branch_data is None
                    child_data.parent_branch_data = branch_data
                    assert child_data.parent is None
                    child_data.parent = branch_data.parent
                    parent_data.branches_revs_data.append(branch_data.child)

    def _sort_branches(self):
        """Sort the branches sprouting from each revision in creation order.

        Creation order is taken to be the reverse of the order that
        they are listed in the symbols part of the RCS file.  (If a
        branch is created then deleted, a later branch can be assigned
        the recycled branch number; therefore branch numbers are not
        an indication of creation order.)"""

        # Descending id order, expressed with key/reverse rather than
        # a cmp function.
        for rev_data in self._rev_data.values():
            rev_data.branches_data.sort(
                key=lambda branch_data: branch_data.id, reverse=True
                )

    def _resolve_tag_dependencies(self):
        """Resolve dependencies involving tags.

        Tags that refer to a non-existing revision are dropped from
        self.sdc.tags_data with a warning."""

        # Iterate over a snapshot, because entries may be deleted from
        # the dictionary inside the loop.
        for (rev, tag_data_list) in list(self.sdc.tags_data.items()):
            try:
                parent_data = self._rev_data[rev]
            except KeyError:
                Log().warn(
                    'In %r:\n'
                    ' the following tag(s) reference non-existing revision %s\n'
                    ' and will be ignored:\n'
                    ' %s' % (
                        self.cvs_file.filename, rev,
                        ', '.join([repr(tag_data.symbol.name)
                                   for tag_data in tag_data_list]),))
                del self.sdc.tags_data[rev]
            else:
                for tag_data in tag_data_list:
                    assert tag_data.rev == rev
                    # The tag_data's rev has the tag as a child:
                    parent_data.tags_data.append(tag_data)

    def _get_cvs_branches(self):
        """Generate the CVSBranches present in this file."""

        for branch_data in self.sdc.branches_data.values():
            yield CVSBranch(
                branch_data.id, self.cvs_file, branch_data.symbol,
                branch_data.branch_number,
                self.sdc.rev_to_lod(branch_data.parent),
                self._get_rev_id(branch_data.parent),
                self._get_rev_id(branch_data.child),
                None,
                )

    def _get_cvs_tags(self):
        """Generate the CVSTags present in this file."""

        for tags_data in self.sdc.tags_data.values():
            for tag_data in tags_data:
                yield CVSTag(
                    tag_data.id, self.cvs_file, tag_data.symbol,
                    self.sdc.rev_to_lod(tag_data.rev),
                    self._get_rev_id(tag_data.rev),
                    None,
                    )

    def set_description(self, description):
        """This is a callback method declared in Sink."""

        self.cvs_file.description = description
        self.cvs_file.determine_file_properties(Ctx().file_property_setters)

    def set_revision_info(self, revision, log, text):
        """This is a callback method declared in Sink."""

        rev_data = self._rev_data[revision]
        cvs_rev = self._cvs_file_items[rev_data.cvs_rev_id]

        if cvs_rev.metadata_id is not None:
            # Users have reported problems with repositories in which
            # the deltatext block for revision 1.1 appears twice.  It
            # is not known whether this results from a CVS/RCS bug, or
            # from botched hand-editing of the repository.  In any
            # case, empirically, cvs and rcs both use the first version
            # when checking out data, so that's what we will do.  (For
            # the record: "cvs log" fails on such a file; "rlog" prints
            # the log message from the first block and ignores the
            # second one.)
            Log().warn(
                "%s: in '%s':\n"
                " Deltatext block for revision %s appeared twice;\n"
                " ignoring the second occurrence.\n"
                % (warning_prefix, self.cvs_file.filename, revision,)
                )
            return

        if is_trunk_revision(revision):
            branch_name = None
        else:
            branch_name = self.sdc.rev_to_branch_data(revision).symbol.name

        cvs_rev.metadata_id = self.collect_data.metadata_logger.store(
            self.project, branch_name, rev_data.author, log
            )
        cvs_rev.deltatext_exists = bool(text)

        # If this is revision 1.1, determine whether the file appears
        # to have been created via 'cvs add' instead of 'cvs import'.
        # The test is that the log message CVS uses for 1.1 in imports
        # is "Initial revision\n" with no period.  (This fact helps
        # determine whether this file might have had a default branch
        # in the past.)
        if revision == '1.1':
            self._file_imported = (log == 'Initial revision\n')

    def parse_completed(self):
        """Finish the processing of this file.

        This is a callback method declared in Sink."""

        # Make sure that there was an info section for each revision:
        for cvs_item in self._cvs_file_items.values():
            if (isinstance(cvs_item, CVSRevision)
                    and cvs_item.metadata_id is None):
                self.collect_data.record_fatal_error(
                    '%r has no deltatext section for revision %s'
                    % (self.cvs_file.filename, cvs_item.rev,)
                    )

    def _determine_operation(self, rev_data):
        """Look up the revision type for REV_DATA in cvs_revision_type_map.

        The key is a pair of booleans: whether REV_DATA itself is not
        'dead', and whether it has a parent that is not 'dead'."""

        prev_rev_data = self._rev_data.get(rev_data.parent)
        return cvs_revision_type_map[(
            rev_data.state != 'dead',
            prev_rev_data is not None and prev_rev_data.state != 'dead',
            )]

    def _get_cvs_revisions(self):
        """Generate the CVSRevisions present in this file."""

        for rev_data in self._rev_data.values():
            yield self._get_cvs_revision(rev_data)

    def _get_cvs_revision(self, rev_data):
        """Create and return a CVSRevision for REV_DATA."""

        branch_ids = [
            branch_data.id
            for branch_data in rev_data.branches_data
            ]

        branch_commit_ids = [
            self._get_rev_id(rev)
            for rev in rev_data.branches_revs_data
            ]

        tag_ids = [
            tag_data.id
            for tag_data in rev_data.tags_data
            ]

        revision_type = self._determine_operation(rev_data)

        return revision_type(
            self._get_rev_id(rev_data.rev), self.cvs_file,
            rev_data.timestamp, None,
            self._get_rev_id(rev_data.parent),
            self._get_rev_id(rev_data.child),
            rev_data.rev,
            True,
            self.sdc.rev_to_lod(rev_data.rev),
            rev_data.get_first_on_branch_id(),
            False, None, None,
            tag_ids, branch_ids, branch_commit_ids,
            rev_data.revision_reader_token
            )

    def get_cvs_file_items(self):
        """Finish up and return a CVSFileItems instance for this file.

        This method must only be called once."""

        self._process_ntdbrs()

        # Break a circular reference loop, allowing the memory for
        # self and sdc to be freed.
        del self.sdc

        return self._cvs_file_items

    def _process_ntdbrs(self):
        """Fix up any non-trunk default branch revisions (if present).

        The actual adjustments are delegated to self._cvs_file_items;
        this method returns nothing.  A VendorBranchError raised along
        the way is recorded as a fatal error.

        There are two cases to handle:

        One case is simple.  The RCS file lists a default branch
        explicitly in its header, such as '1.1.1'.  In this case, we
        know that every revision on the vendor branch is to be treated
        as head of trunk at that point in time.

        But there's also a degenerate case.  The RCS file does not
        currently have a default branch, yet we can deduce that for
        some period in the past it probably *did* have one.  For
        example, the file has vendor revisions 1.1.1.1 -> 1.1.1.96,
        all of which are dated before 1.2, and then it has 1.1.1.97 ->
        1.1.1.100 dated after 1.2.  In this case, we should record
        1.1.1.96 as the last vendor revision to have been the head of
        the default branch.

        If any non-trunk default branch revisions are found:

        - Set their ntdbr members to True.

        - Connect the last one with revision 1.2.

        - Remove revision 1.1 if it is not needed.

        """

        try:
            if self.default_branch:
                vendor_cvs_branch_id = (
                    self.sdc.branches_data[self.default_branch].id
                    )
                vendor_lod_items = self._cvs_file_items.get_lod_items(
                    self._cvs_file_items[vendor_cvs_branch_id]
                    )
                if not self._cvs_file_items.process_live_ntdb(
                        vendor_lod_items):
                    return
            elif self._file_imported:
                vendor_branch_data = self.sdc.branches_data.get('1.1.1')
                if vendor_branch_data is None:
                    return
                else:
                    vendor_lod_items = self._cvs_file_items.get_lod_items(
                        self._cvs_file_items[vendor_branch_data.id]
                        )
                    if not self._cvs_file_items.process_historical_ntdb(
                            vendor_lod_items
                            ):
                        return
            else:
                return
        except VendorBranchError as e:
            self.collect_data.record_fatal_error(str(e))
            return

        if self._file_imported:
            self._cvs_file_items.imported_remove_1_1(vendor_lod_items)

        self._cvs_file_items.check_link_consistency()
class _ProjectDataCollector:
  """Collect the data for the files of a single project.

  An instance owns the project's Trunk line of development and the
  table of symbols seen in the project, counts the files that were
  parsed successfully, and keeps statistics about symbol name
  transformations."""

  def __init__(self, collect_data, project):
    """Set up collection for PROJECT, reporting to COLLECT_DATA.

    COLLECT_DATA supplies the symbol key generator, the trunk
    registration hook and the fatal error log used by this object."""

    self.collect_data = collect_data
    self.project = project

    # The number of files in this project that were parsed successfully:
    self.num_files = 0

    # The Trunk LineOfDevelopment object for this project:
    self.trunk = Trunk(
        self.collect_data.symbol_key_generator.gen_id(), self.project
        )
    self.project.trunk_id = self.trunk.id

    # This causes a record for self.trunk to spring into existence:
    self.collect_data.register_trunk(self.trunk)

    # A map { name -> Symbol } for all known symbols in this project.
    # The symbols listed here are undifferentiated into Branches and
    # Tags because the same name might appear as a branch in one file
    # and a tag in another.
    self.symbols = {}

    # A map { (old_name, new_name) : count } indicating how many files
    # were affected by each symbol name transformation:
    self.symbol_transform_counts = {}

  def get_symbol(self, name):
    """Return the Symbol object for the symbol named NAME in this project.

    If such a symbol does not yet exist, allocate a new symbol_id,
    create a Symbol instance, store it in self.symbols, and return it."""

    symbol = self.symbols.get(name)
    if symbol is None:
      symbol = Symbol(
          self.collect_data.symbol_key_generator.gen_id(),
          self.project, name)
      self.symbols[name] = symbol
    return symbol

  def log_symbol_transform(self, old_name, new_name):
    """Record that OLD_NAME was transformed to NEW_NAME in one file.

    This information is used to generate a statistical summary of
    symbol transforms (see summarize_symbol_transforms())."""

    key = (old_name, new_name)
    self.symbol_transform_counts[key] = (
        self.symbol_transform_counts.get(key, 0) + 1
        )

  def summarize_symbol_transforms(self):
    """Log how many files were affected by each symbol transform.

    Nothing is output if no transforms were recorded or if logging at
    the NORMAL level is disabled.  A transform whose new name is None
    is reported as the symbol having been ignored."""

    if self.symbol_transform_counts and Log().is_on(Log.NORMAL):
      log = Log()
      log.normal('Summary of symbol transforms:')
      transforms = sorted(self.symbol_transform_counts.items())
      for ((old_name, new_name), count) in transforms:
        if new_name is None:
          log.normal('    "%s" ignored in %d files' % (old_name, count,))
        else:
          log.normal(
              '    "%s" transformed to "%s" in %d files'
              % (old_name, new_name, count,)
              )

  def process_file(self, cvs_file):
    """Parse the RCS file for CVS_FILE and return its CVSFileItems.

    If the file is not a valid ,v file, record a fatal error with
    self.collect_data and return None so that the pass can continue
    with other files.  Any other exception is logged as a warning and
    re-raised.  self.num_files is incremented only for files that
    were parsed successfully."""

    Log().normal(cvs_file.filename)
    fdc = _FileDataCollector(self, cvs_file)
    try:
      f = open(cvs_file.filename, 'rb')
      try:
        cvs2svn_rcsparse.parse(f, fdc)
      finally:
        # Release the file handle whether or not parsing succeeded;
        # otherwise one descriptor would be leaked per file processed.
        f.close()
    except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
      self.collect_data.record_fatal_error(
          "%r is not a valid ,v file" % (cvs_file.filename,)
          )
      # Abort the processing of this file, but let the pass continue
      # with other files:
      return
    except:
      Log().warn("Exception occurred while parsing %s" % cvs_file.filename)
      raise
    else:
      self.num_files += 1

    return fdc.get_cvs_file_items()
class CollectData:
  """Repository for data collected by parsing the CVS repository files.

  This class manages the databases into which information collected
  from the CVS repository is stored.  The data are stored into this
  class by _FileDataCollector instances, one of which is created for
  each file to be parsed."""

  def __init__(self, stats_keeper):
    """Create the stores and key generators used during collection.

    STATS_KEEPER is the object to which every recorded CVSFile and
    CVSItem is reported (via record_cvs_file() and
    record_cvs_item())."""

    self._cvs_item_store = NewCVSItemStore(
        artifact_manager.get_temp_file(config.CVS_ITEMS_STORE))
    self.metadata_db = MetadataDatabase(
        artifact_manager.get_temp_file(config.METADATA_STORE),
        artifact_manager.get_temp_file(config.METADATA_INDEX_TABLE),
        DB_OPEN_NEW,
        )
    self.metadata_logger = MetadataLogger(self.metadata_db)

    # Descriptions of the fatal errors recorded so far:
    self.fatal_errors = []

    # Total number of files parsed successfully, over all projects:
    self.num_files = 0

    self.symbol_stats = SymbolStatisticsCollector()
    self.stats_keeper = stats_keeper

    # Key generator for CVSFiles:
    self.file_key_generator = KeyGenerator()

    # Key generator for CVSItems:
    self.item_key_generator = KeyGenerator()

    # Key generator for Symbols:
    self.symbol_key_generator = KeyGenerator()

  def record_fatal_error(self, err):
    """Record that fatal error ERR was found.

    ERR is a string (without trailing newline) describing the error.
    Output the error to stderr immediately, and record a copy to be
    output again in a summary at the end of CollectRevsPass."""

    err = '%s: %s' % (error_prefix, err,)
    Log().error(err + '\n')
    self.fatal_errors.append(err)

  def add_cvs_directory(self, cvs_directory):
    """Record CVS_DIRECTORY."""

    Ctx()._cvs_path_db.log_path(cvs_directory)

  def add_cvs_file_items(self, cvs_file_items):
    """Record the information from CVS_FILE_ITEMS.

    Store the CVSFile to _cvs_path_db under its persistent id, store
    the CVSItems, and record the CVSItems to self.stats_keeper."""

    Ctx()._cvs_path_db.log_path(cvs_file_items.cvs_file)
    self._cvs_item_store.add(cvs_file_items)

    self.stats_keeper.record_cvs_file(cvs_file_items.cvs_file)
    for cvs_item in cvs_file_items.values():
      self.stats_keeper.record_cvs_item(cvs_item)

  def register_trunk(self, trunk):
    """Create a symbol statistics record for the specified trunk LOD."""

    # The lookup is done purely for its side effect: it causes a
    # record to spring into existence.
    self.symbol_stats[trunk]

  def _process_cvs_file_items(self, cvs_file_items):
    """Process the CVSFileItems from one CVSFile."""

    # Remove an initial delete on trunk if it is not needed:
    cvs_file_items.remove_unneeded_initial_trunk_delete(self.metadata_db)

    # Remove initial branch deletes that are not needed:
    cvs_file_items.remove_initial_branch_deletes(self.metadata_db)

    # If this is a --trunk-only conversion, discard all branches and
    # tags, then draft any non-trunk default branch revisions to
    # trunk:
    if Ctx().trunk_only:
      cvs_file_items.exclude_non_trunk()

    cvs_file_items.check_link_consistency()

    self.add_cvs_file_items(cvs_file_items)
    self.symbol_stats.register(cvs_file_items)

  def process_project(self, project):
    """Collect the data for every path in PROJECT.

    Directories are recorded directly; files are parsed and their
    items processed.  A file that cannot be parsed has already been
    reported as a fatal error by the file processing step and is
    skipped here, so that the remaining files are still processed.
    If the project contains no RCS files at all, a fatal error is
    recorded."""

    Ctx()._projects[project.id] = project

    pdc = _ProjectDataCollector(self, project)

    found_rcs_file = False
    for cvs_path in walk_repository(
          project, self.file_key_generator, self.record_fatal_error
          ):
      if isinstance(cvs_path, CVSDirectory):
        self.add_cvs_directory(cvs_path)
      else:
        # An RCS file was found, even if it turns out to be invalid:
        found_rcs_file = True
        cvs_file_items = pdc.process_file(cvs_path)
        if cvs_file_items is None:
          # pdc.process_file() returns None for a file that is not a
          # valid ,v file (the fatal error has been recorded already).
          # Skip it instead of dereferencing None.
          continue
        self._process_cvs_file_items(cvs_file_items)

    if not found_rcs_file:
      self.record_fatal_error(
          'No RCS files found under %r!\n'
          'Are you absolutely certain you are pointing cvs2svn\n'
          'at a CVS repository?\n'
          % (project.project_cvs_repos_path,)
          )

    pdc.summarize_symbol_transforms()

    self.num_files += pdc.num_files
    Log().verbose('Processed', self.num_files, 'files')

  def _register_empty_subdirectories(self):
    """Set the CVSDirectory.empty_subdirectory_id members."""

    # Start with every known directory as a candidate...
    directories = set(
        path
        for path in Ctx()._cvs_path_db.itervalues()
        if isinstance(path, CVSDirectory)
        )
    # ...then rule out each directory that has a file somewhere below
    # it.  The upward walk stops at the first ancestor that has already
    # been ruled out, since its own ancestors were handled at that time.
    for path in Ctx()._cvs_path_db.itervalues():
      if isinstance(path, CVSFile):
        directory = path.parent_directory
        while directory is not None and directory in directories:
          directories.remove(directory)
          directory = directory.parent_directory
    # Whatever remains contains no files; register it with its parent.
    for directory in directories:
      if directory.parent_directory is not None:
        directory.parent_directory.empty_subdirectory_ids.append(directory.id)

  def _set_cvs_path_ordinals(self):
    """Number the recorded paths in CVSPath.slow_compare order.

    Each path in the path database gets its position in the sorted
    sequence assigned to its ordinal member."""

    cvs_paths = list(Ctx()._cvs_path_db.itervalues())
    cvs_paths.sort(CVSPath.slow_compare)
    for (i, cvs_path) in enumerate(cvs_paths):
      cvs_path.ordinal = i

  def close(self):
    """Close the data structures associated with this instance.

    Return a list of fatal errors encountered while processing input.
    Each list entry is a string describing one fatal error."""

    self.symbol_stats.purge_ghost_symbols()
    self.symbol_stats.close()
    self.symbol_stats = None
    self.metadata_logger = None
    self.metadata_db.close()
    self.metadata_db = None
    self._cvs_item_store.close()
    self._cvs_item_store = None
    self._register_empty_subdirectories()
    self._set_cvs_path_ordinals()
    retval = self.fatal_errors
    self.fatal_errors = None
    return retval