Parent classes are considered for up propagating candidates.
[docutils/kirr.git] / sandbox / rstdiff / rstdiff.py
blob7233a452f7d06e63189edd4fe46e55ec0db03ef7
1 #!/usr/bin/env python
3 # Copyright (C) 2010 Stefan Merten
5 # rstdiff.py is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published
7 # by the Free Software Foundation; either version 2 of the License,
8 # or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18 # 02111-1307, USA.
20 """
21 Generates a structural diff from two reStructuredText input documents
22 and produces an annotated result.
23 """
25 __docformat__ = 'reStructuredText'
27 try:
28 import locale
29 locale.setlocale(locale.LC_ALL, '')
30 except:
31 pass
33 import os, re, sys
35 import docutils
36 from docutils import frontend, writers, nodes, SettingsSpec
37 from docutils.core import Publisher
38 from docutils.utils import SystemMessage, Reporter, new_reporter, new_document
39 from docutils.frontend import OptionParser, make_paths_absolute
40 from docutils.transforms import Transform
42 from treediff import TreeMatcher, HashableNodeImpl
44 ###############################################################################
45 ###############################################################################
46 # Command line specification
48 description = ("""Generates a structural diff from two reStructuredText input
49 documents and produces an annotated result. """)
51 writerOption = 'writer'
52 writerDefault = 'xml'
53 writerArgRE1 = '^--' + writerOption + '=' + '(.*)$'
55 settings_spec = (
56 'rstdiff options',
57 None,
58 (('Select writer to write output with (default "xml").',
59 ['--' + writerOption],
60 {}),
64 settings_defaults = {'output_encoding_error_handler': 'xmlcharrefreplace',
65 writerOption: writerDefault}
67 config_section = 'rstdiff'
69 usage = '%prog [options]... <old> [<new> [<output>]]'
71 ###############################################################################
72 # Classes for three argument command lines
74 class Publisher3Args(Publisher):
76 def setup_option_parser(self, usage=None, description=None,
77 settings_spec=None, config_section=None,
78 **defaults):
79 if config_section:
80 if not settings_spec:
81 settings_spec = SettingsSpec()
82 settings_spec.config_section = config_section
83 parts = config_section.split()
84 if len(parts) > 1 and parts[-1] == 'application':
85 settings_spec.config_section_dependencies = ['applications']
86 #@@@ Add self.source & self.destination to components in future?
87 option_parser = OptionParser3Args(
88 components=(self.parser, self.reader, self.writer, settings_spec),
89 defaults=defaults, read_config_files=1,
90 usage=usage, description=description)
91 return option_parser
93 class OptionParser3Args(OptionParser):
95 def check_values(self, values, args):
96 """Store positional arguments as runtime settings."""
97 values._old_source, values._new_source, values._destination = self.check_args(args)
98 make_paths_absolute(values.__dict__, self.relative_path_settings,
99 os.getcwd())
100 values._config_files = self.config_files
101 return values
103 def check_args(self, args):
104 old_source = new_source = destination = None
105 if not args:
106 self.error('At least 1 argument required.')
107 else:
108 old_source = args.pop(0)
109 if old_source == '-': # means stdin
110 old_source = None
111 if args:
112 new_source = args.pop(0)
113 if new_source == '-': # means stdin
114 new_source = None
115 if args:
116 destination = args.pop(0)
117 if destination == '-': # means stdout
118 destination = None
119 if args:
120 self.error('Maximum 3 arguments allowed.')
121 if old_source is None and new_source is None:
122 self.error('Old and new source may not both use stdin.')
123 if (old_source and old_source == destination
124 or new_source and new_source == destination):
125 self.error('Do not specify the same file for both source and '
126 'destination. It will clobber the source file.')
127 return old_source, new_source, destination
129 ###############################################################################
130 ###############################################################################
131 # Helpers
133 class Opcode(object):
134 """Encapsulates opcodes as returned by `TreeMatcher.get_opcodes()`"""
136 Replace = 'replace'
137 Delete = 'delete'
138 Insert = 'insert'
139 Equal = 'equal'
140 Descend = 'descend'
142 _tuple = None
144 def __init__(self, opcodeTuple):
145 """Initialize from a tuple returned by `TreeMatcher.get_opcodes()`"""
146 self._tuple = list(opcodeTuple)
148 def getCommand(self):
149 """Return the command."""
150 return self._tuple[0]
152 def getOldRange(self):
153 """Returns the range pertaining to an old list."""
154 return ( self._tuple[1], self._tuple[2], )
156 def getNewRange(self):
157 """Returns the range pertaining to a new list."""
158 return ( self._tuple[3], self._tuple[4], )
160 def getSubOpcodes(self):
161 """Return the sub-opcodes in case of `command` == 'descend' or
162 `None`."""
163 if self._tuple[0] != self.Descend:
164 return None
165 return self._tuple[5]
167 def resolveOpcode(self, oldList, newList):
168 """Resolves opcode pertaining to `oldList` and `newList`. Returns tuple
169 consisting of
171 command
172 Same as self.getCommand().
174 oldRange
175 The range of elements in `oldList` affected by the opcode.
177 newRange
178 The range of elements in `newList` affected by the opcode.
180 subOpcodes
181 Same as self.getSubOpcodes().
183 oldRange = self.getOldRange()
184 newRange = self.getNewRange()
185 return ( self.getCommand(), oldList[oldRange[0]:oldRange[1]],
186 newList[newRange[0]:newRange[1]], self.getSubOpcodes())
188 def setSubOpcodes(self, opcodes):
189 """Set the sub-opcodes to a new list."""
190 if self._tuple[0] != self.Descend:
191 raise TypeError("Can not set subopcodes of a %r opcode"
192 % ( self._tuple[0], ))
193 self._tuple[5] = opcodes
195 def setCommand(self, command):
196 """Set a new command adapting subopcodes."""
197 if self._tuple[0] == command:
198 return
199 self._tuple[0] = command
200 if command == self.Descend:
201 self._tuple[5] = [ ]
202 else:
203 self._tuple = self._tuple[0:5]
205 def setOldRange(self, range):
206 """Sets the range pertaining to an old list."""
207 ( self._tuple[1], self._tuple[2], ) = range
209 def setNewRange(self, range):
210 """Sets the range pertaining to a new list."""
211 ( self._tuple[3], self._tuple[4], ) = range
213 def asTuple(self):
214 """Return the opcode as a tuple."""
215 return tuple(self._tuple)
217 ###############################################################################
218 ###############################################################################
219 # Additional docutils stuff
221 ###############################################################################
222 # Node types
224 class White(nodes.Text):
225 """A piece of text containing only whitespace."""
227 tagname = '#white'
229 """A regular expression matching strings for this class and returning
230 them as the first match."""
231 # TODO Could be subject to an option
232 re = '(\\s+)'
234 class Word(nodes.Text):
235 """A piece of text containing exactly one word."""
237 tagname = '#word'
239 @staticmethod
240 def splitText(text):
241 """Splits text and returns a sequence of `Word` and `White`
242 objects. Returns an empty sequence for an empty `text`."""
244 subs = re.split(White.re, text.astext())
245 result = [ ]
246 if not subs:
247 return result
248 elif re.match(White.re, subs[0]):
249 ( current, next, ) = ( White, Word, )
250 else:
251 ( current, next, ) = ( Word, White, )
252 for sub in subs:
253 result.append(current(sub))
254 ( current, next, ) = ( next, current, )
255 return result
257 ###############################################################################
258 # Transformers
260 class Text2Words(Transform):
261 """Transforms a `Text` node into a sequence of `Word`/`White`."""
263 def apply(self):
264 self.document.walk(Text2WordsVisitor(self.document))
266 class Text2WordsVisitor(nodes.SparseNodeVisitor):
268 def visit_Text(self, text):
269 words = Word.splitText(text)
270 if not words:
271 # An empty text
272 words = [ White(''), ]
273 text.parent.replace(text, words)
275 class Words2Text(Transform):
276 """Transforms a sequence of `Word`/`White` into a `Text` node."""
278 def apply(self):
279 self.document.walk(Words2TextVisitor(self.document))
281 class Words2TextVisitor(nodes.SparseNodeVisitor):
283 def visit_Text(self, text):
284 parent = text.parent
285 # Find this node and the first node of the sequence it belongs to
286 first = None
287 for i in range(len(parent)):
288 if not isinstance(parent[i], nodes.Text):
289 first = None
290 elif first is None:
291 first = i
292 # ``parent.index(text)`` uses value equality - can not be
293 # used here to find `text`
294 if id(parent[i]) == id(text):
295 end = i + 1
296 break
297 else:
298 raise IndexError("Can not find %r in its parent" % ( text, ))
300 if (len(parent) > end
301 and isinstance(parent[end], nodes.Text)):
302 # The visitor processes following children even if they are
303 # deleted - so work for last node of a sequence
304 return
306 texts = nodes.Text(reduce(lambda s, node: s + node.astext(),
307 parent[first:end], ""))
308 parent[first:end] = ( texts, )
310 visit_White = visit_Text
312 visit_Word = visit_Text
314 ###############################################################################
315 ###############################################################################
316 # Hashable
318 class DocutilsDispatcher(HashableNodeImpl):
319 """Implements hashable for a docutils `Node` and supports construction."""
321 debug = False
323 def __init__(self):
324 super(self.__class__, self).__init__(nodes.Node)
326 def dispatchClass(self, function, node, *args):
327 """Dispatch a call of type `function` for the class of `node` using
328 arguments `node` and `args`. Default is to dispatch for imaginary class
329 "UNKNOWN"."""
330 pat = "%s_%%s" % ( function, )
331 try:
332 name = pat % ( node.__class__.__name__, )
333 method = getattr(self, name)
334 except AttributeError:
335 name = pat % ( 'UNKNOWN', )
336 method = getattr(self, name)
337 if self.debug:
338 print("*** %s(%s)"
339 % ( name, ", ".join([ arg.__class__.__name__
340 for arg in ( node, ) + args ]), ))
341 for arg in ( node, ) + args:
342 print(" > %s" % ( arg, ))
343 result = method(node, *args)
344 if self.debug:
345 print(" < %s" % ( result, ))
346 return result
348 ###########################################################################
349 ###########################################################################
350 # Implementation of abstract methods for `HashableNodeImpl`
352 def rootHash(self, node):
353 """Return a hash for the root only. Subclasses must override
354 this."""
355 return self.dispatchClass('rootHash', node)
357 def rootHash_UNKNOWN(self, node):
358 return hash(node.__class__)
360 def rootEq(self, node, other):
361 """Returns root equality of `node` and an `other` node. ``True`` if
362 the two nodes as roots are equal without considering their
363 children. This should be true if one node can be replaced by
364 the other and all changes can be represented without changing
365 the node itself. Subclasses must override this."""
366 # Only nodes of the same class can be equal - this assumption
367 # is used in many places
368 if node.__class__ != other.__class__:
369 return False
370 return self.dispatchClass('rootEq', node, other)
372 def rootEq_UNKNOWN(self, node, other):
373 # Unless we know better two roots of the same type are considered equal
374 return True
376 def childHash(self, node):
377 """Return a hash for the node as a child. Subclasses must override
378 this."""
379 return self.dispatchClass('childHash', node)
381 def childHash_UNKNOWN(self, node):
382 # By default compare as a child by comparing children
383 return self.childrenHash(node)
385 def childEq(self, node, other):
386 """Returns equality of `node` and an `other` node as children.
387 ``True`` if the child features of the two nodes are equal
388 without considering the root. Subclasses must override
389 this."""
390 # Only nodes of the same class can be equal - this assumption
391 # is used in many places
392 if node.__class__ != other.__class__:
393 return False
394 return self.dispatchClass('childEq', node, other)
396 def childEq_UNKNOWN(self, node, other):
397 # By default compare as a child by comparing children
398 return self.childrenEq(node, other)
400 def getChildren(self, node):
401 """Return the children of `node` as a list. Subclasses must override
402 this."""
403 return self.dispatchClass('getChildren', node)
405 def getChildren_UNKNOWN(self, node):
406 return node.children
408 ###########################################################################
409 ###########################################################################
410 # Merging
412 # TODO The resulting class names should be configurable
413 NewDelete = 'removed'
414 NewInsert = 'added'
415 NewReplaced = 'replaced'
416 NewReplacement = 'replacement'
418 def copyRoot(self, node):
419 """Copy `node` as root and return it."""
420 return self.dispatchClass('copyRoot', node)
422 def copyRoot_UNKNOWN(self, node):
423 return node.copy()
425 def addChild(self, root, child):
426 """Add `child` to `root`."""
427 return self.dispatchClass('addChild', root, child)
429 def addChild_UNKNOWN(self, root, child):
430 root.append(child)
432 def copyChild(self, node, newType):
433 """Copy `node` as child and return it. `newType` is ``None`` for an
434 unchanged child or the change type."""
435 return self.dispatchClass('copyChild', node, newType)
437 def copyChild_UNKNOWN(self, node, newType):
438 return self.setNewType(node.deepcopy(), newType)
440 def copyChildren(self, head, tail, root, newType):
441 """Return a range of new nodes copied from [ `head` ] + `tail` under
442 `root`. `tail` are all the same class as `head`. Nodes are
443 created approproate to type `newType`."""
444 return self.dispatchClass('copyChildren', head, tail, root, newType)
446 def copyChildren_UNKNOWN(self, head, tail, root, newType):
447 return [ self.copyChild(child, newType)
448 for child in [ head, ] + tail ]
450 def copyRange(self, root, children, newType):
451 """Return a range of new nodes copied from `children` under `root`.
452 Nodes are created appropriate to type `newType`."""
453 result = [ ]
454 begin = 0
455 while begin < len(children):
456 first = children[begin]
457 end = begin + 1
458 while end < len(children):
459 last = children[end]
460 if not(first.__class__ == last.__class__
461 or (isinstance(first, nodes.Text)
462 and isinstance(last, nodes.Text))):
463 break
464 end += 1
465 result.extend(self.copyChildren(first, children[begin + 1:end],
466 root, newType))
467 begin = end
468 return result
470 def mergeChildren(self, diffRoot, oldRoot, newRoot,
471 command, oldRange, newRange):
472 """Add children to `diffRoot` merging children `oldRange` / `newRange`
473 of `oldRoot` / `newRoot` by `command`."""
474 if command == Opcode.Equal:
475 for old in oldRange:
476 self.addChild(diffRoot, self.copyChild(old, None))
477 elif command == Opcode.Insert or command == Opcode.Delete:
478 if command == Opcode.Insert:
479 srcRoot = newRoot
480 srcRange = newRange
481 newType = self.NewInsert
482 else:
483 srcRoot = oldRoot
484 srcRange = oldRange
485 newType = self.NewDelete
486 for newChild in self.copyRange(srcRoot, srcRange, newType):
487 self.addChild(diffRoot, newChild)
488 elif command == Opcode.Replace:
489 # TODO Replacement doubles elements. This needs to be
490 # reflected properly in the @ids. If the @ids don't change
491 # there need to be unique @ids for replaced elements. This
492 # needs also to be reflected in referring @refid and
493 # @backrefs.
494 for newChild in self.copyRange(oldRoot, oldRange,
495 self.NewReplaced):
496 self.addChild(diffRoot, newChild)
497 for newChild in self.copyRange(newRoot, newRange,
498 self.NewReplacement):
499 self.addChild(diffRoot, newChild)
500 else:
501 raise TypeError("Unhandled command %r" % ( command, ))
503 ###########################################################################
504 ###########################################################################
505 # Helpers
507 def setNewType(self, node, newType):
508 """Set a class on `node` for `newType` if set. Returns `node`."""
509 if newType:
510 node['classes'].append("change-%s" % ( newType, ))
511 return node
513 ###########################################################################
514 ###########################################################################
515 # Real comparison and merging
517 # The idea is like this: Each node has attributes which need to be
518 # compared as root and it has attributes which need to be compared
519 # as child. This is different for every node type.
521 # Similarly each node type may need special methods for cloning
522 # and merging.
524 ###########################################################################
525 # Text / Word / White
527 def rootHash_Text(self, node):
528 return hash(node.astext())
530 rootHash_Word = rootHash_Text
532 def rootHash_White(self, node):
533 # Whitespace compares all equal
534 return hash('')
536 def rootEq_Text(self, node, other):
537 return node.astext() == other.astext()
539 rootEq_Word = rootEq_Text
541 def rootEq_White(self, node, other):
542 # TODO Must behave different for places where whitespace
543 # differences are relevant
544 return True
546 # Text behaves the same as root or child
548 childHash_Text = rootHash_Text
549 childHash_Word = rootHash_Word
550 childHash_White = rootHash_White
552 childEq_Text = rootEq_Text
553 childEq_Word = rootEq_Word
554 childEq_White = rootEq_White
556 def copyChildren_Text(self, head, tail, root, newType):
557 if not tail and isinstance(head, nodes.Text) and not head.astext():
558 # Do not create empty inlines
559 return [ ]
560 inline = nodes.inline()
561 self.setNewType(inline, newType)
562 inline.extend([ head, ] + tail)
563 return [ inline, ]
565 # Sequences of Text are treated together
566 copyChildren_Word = copyChildren_Text
567 copyChildren_White = copyChildren_Text
569 ###########################################################################
570 # For some elements their attributes need to be considered to
571 # detect changes.
573 def attributeEq(self, node, other, attribute):
574 if (attribute in node) != (attribute in other):
575 return False
576 if not attribute in node:
577 return True
578 return node[attribute] == other[attribute]
580 ###########################################################################
581 # reference
583 def rootEq_reference(self, node, other):
584 return self.attributeEq(node, other, 'refuri')
586 ###########################################################################
587 # target
589 def rootEq_target(self, node, other):
590 return self.attributeEq(node, other, 'refuri')
592 ###########################################################################
593 # bullet_list
595 # TODO This is typically a minor change and should be requested by
596 # a special option
598 def attributeEq_bullet_list(self, node, other):
599 return self.attributeEq(node, other, 'bullet')
601 def rootEq_bullet_list(self, node, other):
602 return self.attributeEq_bullet_list(node, other)
604 def childEq_bullet_list(self, node, other):
605 return (self.attributeEq_bullet_list(node, other)
606 and self.childrenEq(node, other))
608 ###########################################################################
609 # enumerated_list
611 # TODO This is typically a minor change and should be requested by
612 # a special option
614 def attributeEq_enumerated_list(self, node, other):
615 return (self.attributeEq(node, other, 'enumtype')
616 and self.attributeEq(node, other, 'prefix')
617 and self.attributeEq(node, other, 'suffix')
618 and self.attributeEq(node, other, 'start'))
620 def rootEq_enumerated_list(self, node, other):
621 return self.attributeEq_enumerated_list(node, other)
623 def childEq_enumerated_list(self, node, other):
624 return (self.attributeEq_enumerated_list(node, other)
625 and self.childrenEq(node, other))
627 ###########################################################################
628 # image
630 def rootEq_image(self, node, other):
631 if node.__class__ != other.__class__:
632 return False
633 return self.attributeEq(node, other, 'uri')
635 ###########################################################################
636 # Some elements may contain only #PCDATA. They need to propagate
637 # changes in their children up to the element itself.
639 def rootEqWithChildren(self, node, other):
640 if node.__class__ != other.__class__:
641 return False
642 return self.childrenEq(node, other)
644 ###########################################################################
645 # comment
647 rootEq_comment = rootEqWithChildren
649 ###########################################################################
650 # literal
652 rootEq_literal = rootEqWithChildren
654 ###########################################################################
655 # option_string
657 rootEq_option_string = rootEqWithChildren
659 ###########################################################################
660 # label
662 # TODO This is typically a minor change and should be requested by
663 # a special option
665 rootEq_label = rootEqWithChildren
667 ###########################################################################
668 # footnote_reference
670 # TODO This is typically a minor change and should be requested by
671 # a special option
673 rootEq_footnote_reference = rootEqWithChildren
675 ###########################################################################
676 # citation_reference
678 # TODO This is typically a minor change and should be requested by
679 # a special option
681 rootEq_citation_reference = rootEqWithChildren
683 ###########################################################################
684 # For some elements their attributes need to be considered to
685 # detect changes *and* they may contain only #PCDATA.
687 ###########################################################################
688 # option_argument
690 # TODO This is typically a minor change and should be requested by
691 # a special option
693 def attributeEq_option_argument(self, node, other):
694 return self.attributeEq(node, other, 'delimiter')
696 def rootEq_option_argument(self, node, other):
697 return (self.attributeEq_option_argument(node, other)
698 and self.rootEqWithChildren(node, other))
700 def childEq_option_argument(self, node, other):
701 return (self.attributeEq_option_argument(node, other)
702 and self.childrenEq(node, other))
704 ###########################################################################
705 # A change in certain elements must propagate the change up since
706 # they may occur only once. Must be done by parents.
708 # Checks whether `node` and `other` have both a node of type
709 # `childClass` and whether the first of thosee are equal.
710 def rootEqWithChild(self, node, other, childClass):
711 if node.__class__ != other.__class__:
712 return False
714 nodeFound = None
715 for nodeChild in self.getChildren(node):
716 if isinstance(nodeChild, childClass):
717 nodeFound = nodeChild
718 break
720 otherFound = None
721 for otherChild in self.getChildren(other):
722 if isinstance(otherChild, childClass):
723 otherFound = otherChild
724 break
726 if nodeFound is None or otherFound is None:
727 return True
729 return self.childEq(nodeFound, otherFound)
731 ###########################################################################
732 # footnote
734 def rootEq_footnote(self, node, other):
735 return self.rootEqWithChild(node, other, nodes.label)
737 ###########################################################################
738 # citation
740 def rootEq_citation(self, node, other):
741 return self.rootEqWithChild(node, other, nodes.label)
743 ###########################################################################
744 # option
746 def rootEq_option(self, node, other):
747 return self.rootEqWithChild(node, other, nodes.option_string)
749 ###############################################################################
750 ###############################################################################
751 # Main
753 def processCommandLine():
754 """Process command line and return a `Publisher`."""
755 # Determine writer here so options can be given normally
756 preWriter = writerDefault
757 for arg in sys.argv:
758 match = re.search(writerArgRE1, arg)
759 if match:
760 preWriter = match.group(1)
762 pub = Publisher3Args()
763 pub.set_reader('standalone', None, 'restructuredtext')
764 pub.set_writer(preWriter)
766 settingsSpec = SettingsSpec()
767 settingsSpec.settings_spec = settings_spec
768 settingsSpec.settings_defaults = settings_defaults
769 pub.process_command_line(usage=usage, description=description,
770 settings_spec=settingsSpec,
771 config_section=config_section)
772 if pub.settings.writer != preWriter:
773 new_reporter('<cmdline>',
774 pub.settings).severe("Internal error: Mismatch of pre-parsed (%r) and real (%r) writer"
775 % ( preWriter, pub.settings.writer, ))
776 pub.set_destination()
777 return pub
779 def readTree(pub, sourceName):
780 """Read and return a tree from `sourceName`."""
781 # Reset reader - just in case it keeps state from a previous invocation
782 pub.set_reader('standalone', None, 'restructuredtext')
783 pub.set_source(None, sourceName)
784 pub.document = None
785 pub.document = pub.reader.read(pub.source, pub.parser, pub.settings)
786 pub.apply_transforms()
787 return pub.document
789 def doDiff(hashableNodeImpl, oldTree, newTree):
790 """Create a difference from `oldTree` to `newTree` using
791 `hashableNodeImpl`. Returns the opcodes necessary to transform
792 `oldTree` to `newTree`."""
793 matcher = TreeMatcher(hashableNodeImpl, oldTree, newTree,
794 lambda node: isinstance(node, White))
795 return matcher.get_opcodes()
797 def buildDocument(oldTree, newTree, opcodes, settings):
798 """Returns a new document for the result of converting `oldTree` to
799 `newTree` using `opcodes` as description of changes."""
800 if (not isinstance(oldTree, docutils.nodes.document)
801 or not isinstance(newTree, docutils.nodes.document)):
802 raise TypeError("Roots of trees must be documents")
803 return new_document(u"%s => %s"
804 % ( settings._old_source, settings._new_source, ),
805 settings)
807 def buildTree(dispatcher, diffRoot, opcodes, oldRoot, newRoot):
808 """Adds a new sub-tree under `diffRoot` converting children of
809 `oldRoot` to `newRoot` using `opcodes`."""
810 oldChildren = dispatcher.getChildren(oldRoot)
811 newChildren = dispatcher.getChildren(newRoot)
812 for opcode in opcodes:
813 ( command, oldRange, newRange,
814 subOpcodes, ) = Opcode(opcode).resolveOpcode(oldChildren, newChildren)
815 if command == Opcode.Descend:
816 child = dispatcher.copyRoot(oldRange[0])
817 dispatcher.addChild(diffRoot, child)
818 buildTree(dispatcher, child,
819 subOpcodes, oldRange[0], newRange[0])
820 else:
821 dispatcher.mergeChildren(diffRoot, oldRoot, newRoot,
822 command, oldRange, newRange)
824 # A replacement in certain elements must not be propagated up since
825 # they may occur only once and replacement would double them
826 replaceNotUp = ( nodes.title, nodes.subtitle, nodes.term, nodes.field_name,
827 nodes.attribution, nodes.caption, # (%text.model)
828 nodes.header, nodes.footer, nodes.definition,
829 nodes.field_body, nodes.description, nodes.legend,
830 nodes.entry, # (%body.elements;+) or (%body.elements;*)
831 nodes.decoration, nodes.docinfo, nodes.transition,
832 nodes.option_group, nodes.thead,
833 nodes.tbody, # different content model
836 # A replacement in certain elements normally not subject to up
837 # propagation and contained in certain elements may propagate up if
838 # all their siblings are also replacements and would propagate up
839 replaceUpSiblings = (
840 ( nodes.title, nodes.section, ),
841 ( nodes.subtitle, nodes.section, ),
842 ( nodes.term, nodes.definition_list_item, ),
843 ( nodes.field_name, nodes.field, ),
844 ( nodes.attribution, nodes.block_quote, ),
845 ( nodes.caption, nodes.figure, ),
846 ( nodes.definition, nodes.definition_list_item, ),
847 ( nodes.field_body, nodes.field, ),
848 ( nodes.description, nodes.option_list_item, ),
849 ( nodes.legend, nodes.figure, ),
850 ( nodes.option_group, nodes.option_list_item, ),
853 # TODO If much text is replaced in a text element the whole element
854 # should be replaced. This makes more sense to people than two large
855 # replaced/replacement blocks where the only equality is in words like
856 # "the". The exact meaning of "much" should be an option.
857 def cleanOpcodes(opcodes, dispatcher, oldList, newList):
858 """Replace some nasty results in `opcodes` by cleaner versions. Opcodes
859 create `newList` from `oldList`."""
860 mightReplaceUpSiblings = [ ]
861 for i in range(len(opcodes)):
862 opcode = Opcode(opcodes[i])
863 ( command, oldRange, newRange, subOpcodes,
864 ) = opcode.resolveOpcode(oldList, newList)
865 if not subOpcodes:
866 # Nothing to clean for flat or empty opcodes
867 continue
869 oldNode = oldRange[0]
870 newNode = newRange[0]
871 cleanOpcodes(subOpcodes, dispatcher, dispatcher.getChildren(oldNode),
872 dispatcher.getChildren(newNode))
873 j = 1
874 while j < len(subOpcodes):
875 prev = Opcode(subOpcodes[j - 1])
876 this = Opcode(subOpcodes[j])
877 if (this.getCommand() != Opcode.Descend
878 and prev.getCommand() == this.getCommand()):
879 # Merge adjacing opcodes of same type
880 prevOld = prev.getOldRange()
881 prevNew = prev.getNewRange()
882 thisOld = this.getOldRange()
883 thisNew = this.getNewRange()
884 prev.setOldRange(( prevOld[0], thisOld[1], ))
885 prev.setNewRange(( prevNew[0], thisNew[1], ))
886 subOpcodes[j - 1:j + 1] = [ prev.asTuple(), ]
887 else:
888 j += 1
889 opcode.setSubOpcodes(subOpcodes)
890 if len(subOpcodes) == 1:
891 subOpcode = Opcode(subOpcodes[0])
892 if subOpcode.getCommand() == Opcode.Descend:
893 propagateUp = False
894 elif subOpcode.getCommand() == Opcode.Replace:
895 if any([ isinstance(oldNode, cls)
896 for cls in replaceNotUp ]):
897 propagateUp = False
898 if any([ isinstance(oldNode, cls)
899 and isinstance(oldNode.parent, parentCls)
900 for ( cls, parentCls, ) in replaceUpSiblings ]):
901 # If for instance a section/title would
902 # propagate a replacement up the propagation
903 # needs to be done if all siblings would
904 # also propagate a replacement up
905 mightReplaceUpSiblings.append(i)
906 else:
907 propagateUp = True
908 else:
909 propagateUp = True
910 if propagateUp:
911 # Propagate 1-element sequences up
912 opcode.setCommand(subOpcode.getCommand())
913 opcodes[i] = opcode.asTuple()
915 if mightReplaceUpSiblings:
916 # There are entries which might propagate a replace up if all
917 # siblings could do as well
918 if all([ i in mightReplaceUpSiblings
919 or Opcode(opcodes[i]).getCommand() == Opcode.Replace
920 for i in range(len(opcodes)) ]):
921 # All entries are replacements which may propagate up -
922 # actually propagate elements which may propagate
923 for i in mightReplaceUpSiblings:
924 opcode = Opcode(opcodes[i])
925 opcode.setCommand(Opcode.Replace)
926 opcodes[i] = opcode.asTuple()
928 def createDiff(oldTree, newTree, debug=False):
929 """Create and return a diff document from `oldTree` to `newTree`."""
930 dispatcher = DocutilsDispatcher()
931 dispatcher.debug = debug
932 opcodes = doDiff(dispatcher, oldTree, newTree)
933 if debug:
934 from pprint import pprint
935 print(oldTree.asdom().toprettyxml())
936 print(newTree.asdom().toprettyxml())
937 pprint(opcodes, sys.stdout, 2, 40, None)
938 print("^^^ Before cleaning vvv After cleaning")
939 cleanOpcodes(opcodes, dispatcher, [ oldTree ], [ newTree ])
940 if debug:
941 from pprint import pprint
942 pprint(opcodes, sys.stdout, 2, 40, None)
943 if len(opcodes) != 1:
944 raise TypeError("Don't know how to merge documents which are not rootEq")
945 opcode = Opcode(opcodes[0])
946 if opcode.getCommand() not in ( Opcode.Descend, Opcode.Equal, ):
947 # TODO There should be a sense making message for this case
948 # because this may happen due to up propagation of replacements
949 raise TypeError("Don't know how to merge top level opcode of type %r"
950 % ( opcode.getCommand(), ))
952 diffDoc = buildDocument(oldTree, newTree, opcodes, pub.settings)
953 if opcode.getCommand() == Opcode.Equal:
954 # TODO Equality should be reported somehow
955 diffDoc.extend([ child.deepcopy()
956 for child in newTree.children ])
957 else:
958 buildTree(dispatcher, diffDoc, opcode.getSubOpcodes(), oldTree, newTree)
959 return diffDoc
961 if __name__ == '__main__':
962 pub = processCommandLine()
964 oldTree = readTree(pub, pub.settings._old_source)
965 newTree = readTree(pub, pub.settings._new_source)
967 Text2Words(oldTree).apply()
968 Text2Words(newTree).apply()
970 diffDoc = createDiff(oldTree, newTree)
971 Words2Text(diffDoc).apply()
973 pub.writer.write(diffDoc, pub.destination)
974 pub.writer.assemble_parts()
976 # TODO The CSS classes need to be set in a CSS stylesheet