cvs2svn_lib/changeset_graph.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2006-2008 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """The changeset dependency graph."""
  18
  19
  20 from cvs2svn_lib.log import Log
  21 from cvs2svn_lib.changeset import RevisionChangeset
  22 from cvs2svn_lib.changeset import OrderedChangeset
  23 from cvs2svn_lib.changeset import BranchChangeset
  24 from cvs2svn_lib.changeset import TagChangeset
  25
  26
  27 class CycleInGraphException(Exception):
  28   def __init__(self, cycle):
  29     Exception.__init__(
  30         self,
  31         'Cycle found in graph: %s'
  32         % ' -> '.join(map(str, cycle + [cycle[0]])))
  33
  34
  35 class NoPredNodeInGraphException(Exception):
  36   def __init__(self, node):
  37     Exception.__init__(self, 'Node %s has no predecessors' % (node,))
  38
  39
  40 class _NoPredNodes:
  41   """Manage changesets that are to be processed.
  42
  43   Output the changesets in order by time and changeset type.
  44
  45   The implementation of this class is crude: as changesets are added,
  46   they are appended to a list.  When one is needed, the list is sorted
  47   in reverse order and then the last changeset in the list is
  48   returned.  To reduce the number of sorts that are needed, the class
  49   keeps track of whether the list is currently sorted.
  50
  51   All this repeated sorting is wasteful and unnecessary.  We should
  52   instead use a heap to output the changeset order, which would
  53   require O(lg N) work per add()/get() rather than O(1) and O(N lg N)
  54   as in the current implementation [1].  But: (1) the lame interface
  55   of heapq doesn't allow an arbitrary compare function, so we would
  56   have to store extra information in the array elements; (2) in
  57   practice, the number of items in the list at any time is only a tiny
  58   fraction of the total number of changesets; and (3) testing showed
  59   that the heapq implementation is no faster than this one (perhaps
  60   because of the increased memory usage).
  61
  62   [1] According to Objects/listsort.txt in the Python source code, the
  63   Python list-sorting code is heavily optimized for arrays that have
  64   runs of already-sorted elements, so the current cost of get() is
  65   probably closer to O(N) than O(N lg N)."""
  66
  67   def __init__(self, changeset_db):
  68     self.changeset_db = changeset_db
  69     # A list [(node, changeset,)] of nodes with no predecessors:
  70     self._nodes = []
  71     self._sorted = True
  72
  73   def __len__(self):
  74     return len(self._nodes)
  75
  76   @staticmethod
  77   def _compare((node_1, changeset_1), (node_2, changeset_2)):
  78     """Define a (reverse) ordering on self._nodes."""
  79
  80     return cmp(node_2.time_range, node_1.time_range) \
  81            or cmp(changeset_2, changeset_1)
  82
  83   def add(self, node):
  84     self._nodes.append( (node, self.changeset_db[node.id],) )
  85     self._sorted = False
  86
  87   def get(self):
  88     """Return (node, changeset,) of the smallest node.
  89
  90     'Smallest' is defined by self._compare()."""
  91
  92     if not self._sorted:
  93       self._nodes.sort(self._compare)
  94       self._sorted = True
  95     return self._nodes.pop()
  96
  97
  98 class ChangesetGraph(object):
  99   """A graph of changesets and their dependencies."""
 100
 101   def __init__(self, changeset_db, cvs_item_to_changeset_id):
 102     self._changeset_db = changeset_db
 103     self._cvs_item_to_changeset_id = cvs_item_to_changeset_id
 104     # A map { id : ChangesetGraphNode }
 105     self.nodes = {}
 106
 107   def close(self):
 108     self._cvs_item_to_changeset_id.close()
 109     self._cvs_item_to_changeset_id = None
 110     self._changeset_db.close()
 111     self._changeset_db = None
 112
 113   def add_changeset(self, changeset):
 114     """Add CHANGESET to this graph.
 115
 116     Determine and record any dependencies to changesets that are
 117     already in the graph.  This method does not affect the databases."""
 118
 119     node = changeset.create_graph_node(self._cvs_item_to_changeset_id)
 120
 121     # Now tie the node into our graph.  If a changeset referenced by
 122     # node is already in our graph, then add the backwards connection
 123     # from the other node to the new one.  If not, then delete the
 124     # changeset from node.
 125
 126     for pred_id in list(node.pred_ids):
 127       pred_node = self.nodes.get(pred_id)
 128       if pred_node is not None:
 129         pred_node.succ_ids.add(node.id)
 130       else:
 131         node.pred_ids.remove(pred_id)
 132
 133     for succ_id in list(node.succ_ids):
 134       succ_node = self.nodes.get(succ_id)
 135       if succ_node is not None:
 136         succ_node.pred_ids.add(node.id)
 137       else:
 138         node.succ_ids.remove(succ_id)
 139
 140     self.nodes[node.id] = node
 141
 142   def store_changeset(self, changeset):
 143     for cvs_item_id in changeset.cvs_item_ids:
 144       self._cvs_item_to_changeset_id[cvs_item_id] = changeset.id
 145     self._changeset_db.store(changeset)
 146
 147   def add_new_changeset(self, changeset):
 148     """Add the new CHANGESET to the graph and also to the databases."""
 149
 150     if Log().is_on(Log.DEBUG):
 151       Log().debug('Adding changeset %r' % (changeset,))
 152
 153     self.add_changeset(changeset)
 154     self.store_changeset(changeset)
 155
 156   def delete_changeset(self, changeset):
 157     """Remove CHANGESET from the graph and also from the databases.
 158
 159     In fact, we don't remove CHANGESET from
 160     self._cvs_item_to_changeset_id, because in practice the CVSItems
 161     in CHANGESET are always added again as part of a new CHANGESET,
 162     which will cause the old values to be overwritten."""
 163
 164     if Log().is_on(Log.DEBUG):
 165       Log().debug('Removing changeset %r' % (changeset,))
 166
 167     del self[changeset.id]
 168     del self._changeset_db[changeset.id]
 169
 170   def __nonzero__(self):
 171     """Instances are considered True iff they contain any nodes."""
 172
 173     return bool(self.nodes)
 174
 175   def __contains__(self, id):
 176     """Return True if the specified ID is contained in this graph."""
 177
 178     return id in self.nodes
 179
 180   def __getitem__(self, id):
 181     return self.nodes[id]
 182
 183   def get(self, id):
 184     return self.nodes.get(id)
 185
 186   def __delitem__(self, id):
 187     """Remove the node corresponding to ID.
 188
 189     Also remove references to it from other nodes.  This method does
 190     not change pred_ids or succ_ids of the node being deleted, nor
 191     does it affect the databases."""
 192
 193     node = self[id]
 194
 195     for succ_id in node.succ_ids:
 196       succ = self[succ_id]
 197       succ.pred_ids.remove(node.id)
 198
 199     for pred_id in node.pred_ids:
 200       pred = self[pred_id]
 201       pred.succ_ids.remove(node.id)
 202
 203     del self.nodes[node.id]
 204
 205   def keys(self):
 206     return self.nodes.keys()
 207
 208   def __iter__(self):
 209     return self.nodes.itervalues()
 210
 211   def _get_path(self, reachable_changesets, starting_node_id, ending_node_id):
 212     """Return the shortest path from ENDING_NODE_ID to STARTING_NODE_ID.
 213
 214     Find a path from ENDING_NODE_ID to STARTING_NODE_ID in
 215     REACHABLE_CHANGESETS, where STARTING_NODE_ID is the id of a
 216     changeset that depends on the changeset with ENDING_NODE_ID.  (See
 217     the comment in search_for_path() for a description of the format
 218     of REACHABLE_CHANGESETS.)
 219
 220     Return a list of changesets, where the 0th one has ENDING_NODE_ID
 221     and the last one has STARTING_NODE_ID.  If there is no such path
 222     described in in REACHABLE_CHANGESETS, return None."""
 223
 224     if ending_node_id not in reachable_changesets:
 225       return None
 226
 227     path = [self._changeset_db[ending_node_id]]
 228     id = reachable_changesets[ending_node_id][1]
 229     while id != starting_node_id:
 230       path.append(self._changeset_db[id])
 231       id = reachable_changesets[id][1]
 232     path.append(self._changeset_db[starting_node_id])
 233     return path
 234
 235   def search_for_path(self, starting_node_id, stop_set):
 236     """Search for paths to prerequisites of STARTING_NODE_ID.
 237
 238     Try to find the shortest dependency path that causes the changeset
 239     with STARTING_NODE_ID to depend (directly or indirectly) on one of
 240     the changesets whose ids are contained in STOP_SET.
 241
 242     We consider direct and indirect dependencies in the sense that the
 243     changeset can be reached by following a chain of predecessor nodes.
 244
 245     When one of the changeset_ids in STOP_SET is found, terminate the
 246     search and return the path from that changeset_id to
 247     STARTING_NODE_ID.  If no path is found to a node in STOP_SET,
 248     return None."""
 249
 250     # A map {node_id : (steps, next_node_id)} where NODE_ID can be
 251     # reached from STARTING_NODE_ID in STEPS steps, and NEXT_NODE_ID
 252     # is the id of the previous node in the path.  STARTING_NODE_ID is
 253     # only included as a key if there is a loop leading back to it.
 254     reachable_changesets = {}
 255
 256     # A list of (node_id, steps) that still have to be investigated,
 257     # and STEPS is the number of steps to get to NODE_ID.
 258     open_nodes = [(starting_node_id, 0)]
 259     # A breadth-first search:
 260     while open_nodes:
 261       (id, steps) = open_nodes.pop(0)
 262       steps += 1
 263       node = self[id]
 264       for pred_id in node.pred_ids:
 265         # Since the search is breadth-first, we only have to set steps
 266         # that don't already exist.
 267         if pred_id not in reachable_changesets:
 268           reachable_changesets[pred_id] = (steps, id)
 269           open_nodes.append((pred_id, steps))
 270
 271           # See if we can stop now:
 272           if pred_id in stop_set:
 273             return self._get_path(
 274                 reachable_changesets, starting_node_id, pred_id
 275                 )
 276
 277     return None
 278
 279   def consume_nopred_nodes(self):
 280     """Remove and yield changesets in dependency order.
 281
 282     Each iteration, this generator yields a (changeset, time_range)
 283     tuple for the oldest changeset in the graph that doesn't have any
 284     predecessor nodes (i.e., it is ready to be committed).  This is
 285     continued until there are no more nodes without predecessors
 286     (either because the graph has been emptied, or because of cycles
 287     in the graph).
 288
 289     Among the changesets that are ready to be processed, the earliest
 290     one (according to the sorting of the TimeRange class) is yielded
 291     each time.  (This is the order in which the changesets should be
 292     committed.)
 293
 294     The graph should not be otherwise altered while this generator is
 295     running."""
 296
 297     # Find a list of (node,changeset,) where the node has no
 298     # predecessors:
 299     nopred_nodes = _NoPredNodes(self._changeset_db)
 300     for node in self.nodes.itervalues():
 301       if not node.pred_ids:
 302         nopred_nodes.add(node)
 303
 304     while nopred_nodes:
 305       (node, changeset,) = nopred_nodes.get()
 306       del self[node.id]
 307       # See if any successors are now ready for extraction:
 308       for succ_id in node.succ_ids:
 309         succ = self[succ_id]
 310         if not succ.pred_ids:
 311           nopred_nodes.add(succ)
 312       yield (changeset, node.time_range)
 313
 314   def find_cycle(self, starting_node_id):
 315     """Find a cycle in the dependency graph and return it.
 316
 317     Use STARTING_NODE_ID as the place to start looking.  This routine
 318     must only be called after all nopred_nodes have been removed.
 319     Return the list of changesets that are involved in the cycle
 320     (ordered such that cycle[n-1] is a predecessor of cycle[n] and
 321     cycle[-1] is a predecessor of cycle[0])."""
 322
 323     # Since there are no nopred nodes in the graph, all nodes in the
 324     # graph must either be involved in a cycle or depend (directly or
 325     # indirectly) on nodes that are in a cycle.
 326
 327     # Pick an arbitrary node:
 328     node = self[starting_node_id]
 329
 330     seen_nodes = [node]
 331
 332     # Follow it backwards until a node is seen a second time; then we
 333     # have our cycle.
 334     while True:
 335       # Pick an arbitrary predecessor of node.  It must exist, because
 336       # there are no nopred nodes:
 337       try:
 338         node_id = node.pred_ids.__iter__().next()
 339       except StopIteration:
 340         raise NoPredNodeInGraphException(node)
 341       node = self[node_id]
 342       try:
 343         i = seen_nodes.index(node)
 344       except ValueError:
 345         seen_nodes.append(node)
 346       else:
 347         seen_nodes = seen_nodes[i:]
 348         seen_nodes.reverse()
 349         return [self._changeset_db[node.id] for node in seen_nodes]
 350
 351   def consume_graph(self, cycle_breaker=None):
 352     """Remove and yield changesets from this graph in dependency order.
 353
 354     Each iteration, this generator yields a (changeset, time_range)
 355     tuple for the oldest changeset in the graph that doesn't have any
 356     predecessor nodes.  If CYCLE_BREAKER is specified, then call
 357     CYCLE_BREAKER(cycle) whenever a cycle is encountered, where cycle
 358     is the list of changesets that are involved in the cycle (ordered
 359     such that cycle[n-1] is a predecessor of cycle[n] and cycle[-1] is
 360     a predecessor of cycle[0]).  CYCLE_BREAKER should break the cycle
 361     in place then return.
 362
 363     If a cycle is found and CYCLE_BREAKER was not specified, raise
 364     CycleInGraphException."""
 365
 366     while True:
 367       for (changeset, time_range) in self.consume_nopred_nodes():
 368         yield (changeset, time_range)
 369
 370       # If there are any nodes left in the graph, then there must be
 371       # at least one cycle.  Find a cycle and process it.
 372
 373       # This might raise StopIteration, but that indicates that the
 374       # graph has been fully consumed, so we just let the exception
 375       # escape.
 376       start_node_id = self.nodes.iterkeys().next()
 377
 378       cycle = self.find_cycle(start_node_id)
 379
 380       if cycle_breaker is not None:
 381         cycle_breaker(cycle)
 382       else:
 383         raise CycleInGraphException(cycle)
 384
 385   def __repr__(self):
 386     """For convenience only.  The format is subject to change at any time."""
 387
 388     if self.nodes:
 389       return 'ChangesetGraph:\n%s' \
 390              % ''.join(['  %r\n' % node for node in self])
 391     else:
 392       return 'ChangesetGraph:\n  EMPTY\n'
 393
 394   node_colors = {
 395       RevisionChangeset : 'lightgreen',
 396       OrderedChangeset : 'cyan',
 397       BranchChangeset : 'orange',
 398       TagChangeset : 'yellow',
 399       }
 400
 401   def output_coarse_dot(self, f):
 402     """Output the graph in DOT format to file-like object f.
 403
 404     Such a file can be rendered into a visual representation of the
 405     graph using tools like graphviz.  Include only changesets in the
 406     graph, and the dependencies between changesets."""
 407
 408     f.write('digraph G {\n')
 409     for node in self:
 410       f.write(
 411           '  C%x [style=filled, fillcolor=%s];\n' % (
 412               node.id,
 413               self.node_colors[self._changeset_db[node.id].__class__],
 414               )
 415           )
 416     f.write('\n')
 417
 418     for node in self:
 419       for succ_id in node.succ_ids:
 420         f.write('  C%x -> C%x\n' % (node.id, succ_id,))
 421       f.write('\n')
 422
 423     f.write('}\n')
 424
 425   def output_fine_dot(self, f):
 426     """Output the graph in DOT format to file-like object f.
 427
 428     Such a file can be rendered into a visual representation of the
 429     graph using tools like graphviz.  Include all CVSItems and the
 430     CVSItem-CVSItem dependencies in the graph.  Group the CVSItems
 431     into clusters by changeset."""
 432
 433     f.write('digraph G {\n')
 434     for node in self:
 435       f.write('  subgraph cluster_%x {\n' % (node.id,))
 436       f.write('    label = "C%x";\n' % (node.id,))
 437       changeset = self._changeset_db[node.id]
 438       for item_id in changeset.cvs_item_ids:
 439         f.write('    I%x;\n' % (item_id,))
 440       f.write('    style=filled;\n')
 441       f.write(
 442           '    fillcolor=%s;\n'
 443           % (self.node_colors[self._changeset_db[node.id].__class__],))
 444       f.write('  }\n\n')
 445
 446     for node in self:
 447       changeset = self._changeset_db[node.id]
 448       for cvs_item in changeset.iter_cvs_items():
 449         for succ_id in cvs_item.get_succ_ids():
 450           f.write('  I%x -> I%x;\n' % (cvs_item.id, succ_id,))
 451
 452       f.write('\n')
 453
 454     f.write('}\n')
 455
 456