3 # Copyright (C) 2010 Stefan Merten
5 # rstdiff.py is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published
7 # by the Free Software Foundation; either version 2 of the License,
8 # or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 Generates a structural diff from two reStructuredText input documents
22 and produces an annotated result.
25 __docformat__
= 'reStructuredText'
29 locale
.setlocale(locale
.LC_ALL
, '')
36 from docutils
import frontend
, writers
, nodes
, SettingsSpec
37 from docutils
.core
import Publisher
38 from docutils
.utils
import SystemMessage
, Reporter
, new_reporter
, new_document
39 from docutils
.frontend
import OptionParser
, make_paths_absolute
40 from docutils
.transforms
import Transform
42 from treediff
import TreeMatcher
, HashableNodeImpl
44 ###############################################################################
45 ###############################################################################
46 # Command line specification
48 description
= ("""Generates a structural diff from two reStructuredText input
49 documents and produces an annotated result. """)
51 writerOption
= 'writer'
53 writerArgRE1
= '^--' + writerOption
+ '=' + '(.*)$'
58 (('Select writer to write output with (default "xml").',
59 ['--' + writerOption
],
64 settings_defaults
= {'output_encoding_error_handler': 'xmlcharrefreplace',
65 writerOption
: writerDefault
}
67 config_section
= 'rstdiff'
69 usage
= '%prog [options]... <old> [<new> [<output>]]'
71 ###############################################################################
72 # Classes for three argument command lines
74 class Publisher3Args(Publisher
):
76 def setup_option_parser(self
, usage
=None, description
=None,
77 settings_spec
=None, config_section
=None,
81 settings_spec
= SettingsSpec()
82 settings_spec
.config_section
= config_section
83 parts
= config_section
.split()
84 if len(parts
) > 1 and parts
[-1] == 'application':
85 settings_spec
.config_section_dependencies
= ['applications']
86 #@@@ Add self.source & self.destination to components in future?
87 option_parser
= OptionParser3Args(
88 components
=(self
.parser
, self
.reader
, self
.writer
, settings_spec
),
89 defaults
=defaults
, read_config_files
=1,
90 usage
=usage
, description
=description
)
93 class OptionParser3Args(OptionParser
):
95 def check_values(self
, values
, args
):
96 """Store positional arguments as runtime settings."""
97 values
._old
_source
, values
._new
_source
, values
._destination
= self
.check_args(args
)
98 make_paths_absolute(values
.__dict
__, self
.relative_path_settings
,
100 values
._config
_files
= self
.config_files
103 def check_args(self
, args
):
104 old_source
= new_source
= destination
= None
106 self
.error('At least 1 argument required.')
108 old_source
= args
.pop(0)
109 if old_source
== '-': # means stdin
112 new_source
= args
.pop(0)
113 if new_source
== '-': # means stdin
116 destination
= args
.pop(0)
117 if destination
== '-': # means stdout
120 self
.error('Maximum 3 arguments allowed.')
121 if old_source
is None and new_source
is None:
122 self
.error('Old and new source may not both use stdin.')
123 if (old_source
and old_source
== destination
124 or new_source
and new_source
== destination
):
125 self
.error('Do not specify the same file for both source and '
126 'destination. It will clobber the source file.')
127 return old_source
, new_source
, destination
129 ###############################################################################
130 ###############################################################################
133 class Opcode(object):
134 """Encapsulates opcodes as returned by `TreeMatcher.get_opcodes()`"""
144 def __init__(self
, opcodeTuple
):
145 """Initialize from a tuple returned by `TreeMatcher.get_opcodes()`"""
146 self
._tuple
= list(opcodeTuple
)
148 def getCommand(self
):
149 """Return the command."""
150 return self
._tuple
[0]
152 def getOldRange(self
):
153 """Returns the range pertaining to an old list."""
154 return ( self
._tuple
[1], self
._tuple
[2], )
156 def getNewRange(self
):
157 """Returns the range pertaining to a new list."""
158 return ( self
._tuple
[3], self
._tuple
[4], )
160 def getSubOpcodes(self
):
161 """Return the sub-opcodes in case of `command` == 'descend' or
163 if self
._tuple
[0] != self
.Descend
:
165 return self
._tuple
[5]
167 def resolveOpcode(self
, oldList
, newList
):
168 """Resolves opcode pertaining to `oldList` and `newList`. Returns tuple
172 Same as self.getCommand().
175 The range of elements in `oldList` affected by the opcode.
178 The range of elements in `newList` affected by the opcode.
181 Same as self.getSubOpcodes().
183 oldRange
= self
.getOldRange()
184 newRange
= self
.getNewRange()
185 return ( self
.getCommand(), oldList
[oldRange
[0]:oldRange
[1]],
186 newList
[newRange
[0]:newRange
[1]], self
.getSubOpcodes())
188 def setSubOpcodes(self
, opcodes
):
189 """Set the sub-opcodes to a new list."""
190 if self
._tuple
[0] != self
.Descend
:
191 raise TypeError("Can not set subopcodes of a %r opcode"
192 % ( self
._tuple
[0], ))
193 self
._tuple
[5] = opcodes
195 def setCommand(self
, command
):
196 """Set a new command adapting subopcodes."""
197 if self
._tuple
[0] == command
:
199 self
._tuple
[0] = command
200 if command
== self
.Descend
:
203 self
._tuple
= self
._tuple
[0:5]
205 def setOldRange(self
, range):
206 """Sets the range pertaining to an old list."""
207 ( self
._tuple
[1], self
._tuple
[2], ) = range
209 def setNewRange(self
, range):
210 """Sets the range pertaining to a new list."""
211 ( self
._tuple
[3], self
._tuple
[4], ) = range
214 """Return the opcode as a tuple."""
215 return tuple(self
._tuple
)
217 ###############################################################################
218 ###############################################################################
219 # Additional docutils stuff
221 ###############################################################################
224 class White(nodes
.Text
):
225 """A piece of text containing only whitespace."""
229 """A regular expression matching strings for this class and returning
230 them as the first match."""
231 # TODO Could be subject to an option
234 class Word(nodes
.Text
):
235 """A piece of text containing exactly one word."""
241 """Splits text and returns a sequence of `Word` and `White`
242 objects. Returns an empty sequence for an empty `text`."""
244 subs
= re
.split(White
.re
, text
.astext())
248 elif re
.match(White
.re
, subs
[0]):
249 ( current
, next
, ) = ( White
, Word
, )
251 ( current
, next
, ) = ( Word
, White
, )
253 result
.append(current(sub
))
254 ( current
, next
, ) = ( next
, current
, )
257 ###############################################################################
260 class Text2Words(Transform
):
261 """Transforms a `Text` node into a sequence of `Word`/`White`."""
264 self
.document
.walk(Text2WordsVisitor(self
.document
))
266 class Text2WordsVisitor(nodes
.SparseNodeVisitor
):
268 def visit_Text(self
, text
):
269 words
= Word
.splitText(text
)
272 words
= [ White(''), ]
273 text
.parent
.replace(text
, words
)
275 class Words2Text(Transform
):
276 """Transforms a sequence of `Word`/`White` into a `Text` node."""
279 self
.document
.walk(Words2TextVisitor(self
.document
))
281 class Words2TextVisitor(nodes
.SparseNodeVisitor
):
283 def visit_Word(self
, word
):
285 # Find this node and the first node of the sequence it belongs to
287 for i
in range(len(parent
)):
288 if not isinstance(parent
[i
], nodes
.Text
):
292 # ``parent.index(word)`` uses value equality - can not be
293 # used here to find `word`
294 if id(parent
[i
]) == id(word
):
298 raise IndexError("Can not find %r in its parent" % ( word
, ))
300 if (len(parent
) > end
301 and isinstance(parent
[end
], nodes
.Text
)):
302 # The visitor processes following children even if they are
303 # deleted - so work for last node of a sequence
306 text
= nodes
.Text(reduce(lambda s
, node
: s
+ node
.astext(),
307 parent
[first
:end
], ""))
308 parent
[first
:end
] = ( text
, )
310 visit_White
= visit_Word
312 ###############################################################################
313 ###############################################################################
316 class DocutilsDispatcher(HashableNodeImpl
):
317 """Implements hashable for a docutils `Node` and supports construction."""
322 super(self
.__class
__, self
).__init
__(nodes
.Node
)
324 def dispatchClass(self
, function
, node
, *args
):
325 """Dispatch a call of type `function` for the class of `node` using
326 arguments `node` and `args`. Default is to dispatch for imaginary class
328 pat
= "%s_%%s" % ( function
, )
330 name
= pat
% ( node
.__class
__.__name
__, )
331 method
= getattr(self
, name
)
332 except AttributeError:
333 name
= pat
% ( 'UNKNOWN', )
334 method
= getattr(self
, name
)
337 % ( name
, ", ".join([ arg
.__class
__.__name
__
338 for arg
in ( node
, ) + args
]), ))
339 for arg
in ( node
, ) + args
:
340 print(" %s" % ( arg
, ))
341 result
= method(node
, *args
)
343 print(" %s" % ( result
, ))
346 ###########################################################################
347 ###########################################################################
348 # Implementation of abstract methods for `HashableNodeImpl`
350 def rootHash(self
, node
):
351 """Return a hash for the root only. Subclasses must override
353 return self
.dispatchClass('rootHash', node
)
355 def rootHash_UNKNOWN(self
, node
):
356 return hash(node
.__class
__)
358 def rootEq(self
, node
, other
):
359 """Returns root equality of `node` and an `other` node. ``True`` if
360 the two nodes as roots are equal without considering their
361 children. This should be true if one node can be replaced by
362 the other and all changes can be represented without changing
363 the node itself. Subclasses must override this."""
364 # Only nodes of the same class can be equal - this assumption
365 # is used in many places
366 if node
.__class
__ != other
.__class
__:
368 return self
.dispatchClass('rootEq', node
, other
)
370 def rootEq_UNKNOWN(self
, node
, other
):
371 # Unless we know better two roots of the same type are considered equal
374 def childHash(self
, node
):
375 """Return a hash for the node as a child. Subclasses must override
377 return self
.dispatchClass('childHash', node
)
379 def childHash_UNKNOWN(self
, node
):
380 # By default compare as a child by comparing children
381 return self
.childrenHash(node
)
383 def childEq(self
, node
, other
):
384 """Returns equality of `node` and an `other` node as children.
385 ``True`` if the child features of the two nodes are equal
386 without considering the root. Subclasses must override
388 # Only nodes of the same class can be equal - this assumption
389 # is used in many places
390 if node
.__class
__ != other
.__class
__:
392 return self
.dispatchClass('childEq', node
, other
)
394 def childEq_UNKNOWN(self
, node
, other
):
395 # By default compare as a child by comparing children
396 return self
.childrenEq(node
, other
)
398 def getChildren(self
, node
):
399 """Return the children of `node` as a list. Subclasses must override
401 return self
.dispatchClass('getChildren', node
)
403 def getChildren_UNKNOWN(self
, node
):
406 ###########################################################################
407 ###########################################################################
410 NewDelete
= 'removed'
412 NewReplaced
= 'replaced'
413 NewReplacement
= 'replacement'
415 def copyRoot(self
, node
):
416 """Copy `node` as root and return it."""
417 return self
.dispatchClass('copyRoot', node
)
419 def copyRoot_UNKNOWN(self
, node
):
422 def addChild(self
, root
, child
):
423 """Add `child` to `root`."""
424 return self
.dispatchClass('addChild', root
, child
)
426 def addChild_UNKNOWN(self
, root
, child
):
429 def copyChild(self
, node
, newType
):
430 """Copy `node` as child and return it. `newType` is ``None`` for an
431 unchanged child or the change type."""
432 return self
.dispatchClass('copyChild', node
, newType
)
434 def copyChild_UNKNOWN(self
, node
, newType
):
435 return self
.setNewType(node
.deepcopy(), newType
)
437 def copyChildren(self
, head
, tail
, root
, newType
):
438 """Return a range of new nodes copied from [ `head` ] + `tail` under
439 `root`. `tail` are all the same class as `head`. Nodes are
440 created approproate to type `newType`."""
441 return self
.dispatchClass('copyChildren', head
, tail
, root
, newType
)
443 def copyChildren_UNKNOWN(self
, head
, tail
, root
, newType
):
444 return [ self
.copyChild(child
, newType
)
445 for child
in [ head
, ] + tail
]
447 def copyRange(self
, root
, children
, newType
):
448 """Return a range of new nodes copied from `children` under `root`.
449 Nodes are created appropriate to type `newType`."""
452 while begin
< len(children
):
453 first
= children
[begin
]
455 while end
< len(children
):
457 if not(first
.__class
__ == last
.__class
__
458 or (isinstance(first
, nodes
.Text
)
459 and isinstance(last
, nodes
.Text
))):
462 result
.extend(self
.copyChildren(first
, children
[begin
+ 1:end
],
467 def mergeChildren(self
, diffRoot
, oldRoot
, newRoot
,
468 command
, oldRange
, newRange
):
469 """Add children to `diffRoot` merging children `oldRange` / `newRange`
470 of `oldRoot` / `newRoot` by `command`."""
471 if command
== Opcode
.Equal
:
473 self
.addChild(diffRoot
, self
.copyChild(old
, None))
474 elif command
== Opcode
.Insert
or command
== Opcode
.Delete
:
475 if command
== Opcode
.Insert
:
478 newType
= self
.NewInsert
482 newType
= self
.NewDelete
483 for newChild
in self
.copyRange(srcRoot
, srcRange
, newType
):
484 self
.addChild(diffRoot
, newChild
)
485 elif command
== Opcode
.Replace
:
486 for newChild
in self
.copyRange(oldRoot
, oldRange
,
488 self
.addChild(diffRoot
, newChild
)
489 for newChild
in self
.copyRange(newRoot
, newRange
,
490 self
.NewReplacement
):
491 self
.addChild(diffRoot
, newChild
)
493 raise TypeError("Unhandled command %r" % ( command
, ))
495 ###########################################################################
496 ###########################################################################
499 def setNewType(self
, node
, newType
):
500 """Set a class on `node` for `newType` if set. Returns `node`."""
502 node
['classes'].append("change-%s" % ( newType
, ))
505 ###########################################################################
506 ###########################################################################
507 # Real comparison and merging
509 # The idea is like this: Each node has attributes which need to be
510 # compared as root and it has attributes which need to be compared
511 # as child. This is different for every node type.
513 # Similarly each node type may need special methods for cloning
516 ###########################################################################
517 # Text / Word / White
519 def rootHash_Text(self
, node
):
520 return hash(node
.astext())
522 rootHash_Word
= rootHash_Text
524 def rootHash_White(self
, node
):
525 # Whitespace compares all equal
528 def rootEq_Text(self
, node
, other
):
529 return node
.astext() == other
.astext()
531 rootEq_Word
= rootEq_Text
533 def rootEq_White(self
, node
, other
):
534 # TODO Must behave different for places where whitespace
535 # differences are relevant
538 # Text behaves the same as root or child
540 childHash_Text
= rootHash_Text
541 childHash_Word
= rootHash_Word
542 childHash_White
= rootHash_White
544 childEq_Text
= rootEq_Text
545 childEq_Word
= rootEq_Word
546 childEq_White
= rootEq_White
548 def copyChildren_Text(self
, head
, tail
, root
, newType
):
549 inline
= nodes
.inline()
550 self
.setNewType(inline
, newType
)
551 inline
.extend([ head
, ] + tail
)
554 # Sequences of Text are treated together
555 copyChildren_Word
= copyChildren_Text
556 copyChildren_White
= copyChildren_Text
558 ###############################################################################
559 ###############################################################################
562 def processCommandLine():
563 """Process command line and return a `Publisher`."""
564 # Determine writer here so options can be given normally
565 preWriter
= writerDefault
567 match
= re
.search(writerArgRE1
, arg
)
569 preWriter
= match
.group(1)
571 pub
= Publisher3Args()
572 pub
.set_reader('standalone', None, 'restructuredtext')
573 pub
.set_writer(preWriter
)
575 settingsSpec
= SettingsSpec()
576 settingsSpec
.settings_spec
= settings_spec
577 settingsSpec
.settings_defaults
= settings_defaults
578 pub
.process_command_line(usage
=usage
, description
=description
,
579 settings_spec
=settingsSpec
,
580 config_section
=config_section
)
581 if pub
.settings
.writer
!= preWriter
:
582 new_reporter('<cmdline>',
583 pub
.settings
).severe("Internal error: Mismatch of pre-parsed (%r) and real (%r) writer"
584 % ( preWriter
, pub
.settings
.writer
, ))
585 pub
.set_destination()
588 def readTree(pub
, sourceName
):
589 """Read and return a tree from `sourceName`."""
590 # Reset reader - just in case it keeps state from a previous invocation
591 pub
.set_reader('standalone', None, 'restructuredtext')
592 pub
.set_source(None, sourceName
)
594 pub
.document
= pub
.reader
.read(pub
.source
, pub
.parser
, pub
.settings
)
595 pub
.apply_transforms()
598 def doDiff(hashableNodeImpl
, oldTree
, newTree
):
599 """Create a difference from `oldTree` to `newTree` using
600 `hashableNodeImpl`. Returns the opcodes necessary to transform
601 `oldTree` to `newTree`."""
602 matcher
= TreeMatcher(hashableNodeImpl
, oldTree
, newTree
,
603 lambda node
: isinstance(node
, White
))
604 return matcher
.get_opcodes()
606 def buildDocument(oldTree
, newTree
, opcodes
, settings
):
607 """Returns a new document for the result of converting `oldTree` to
608 `newTree` using `opcodes` as description of changes."""
609 if (not isinstance(oldTree
, docutils
.nodes
.document
)
610 or not isinstance(newTree
, docutils
.nodes
.document
)):
611 raise TypeError("Roots of trees must be documents")
612 return new_document(u
"%s => %s"
613 % ( settings
._old
_source
, settings
._new
_source
, ),
616 def buildTree(dispatcher
, diffRoot
, opcodes
, oldRoot
, newRoot
):
617 """Adds a new sub-tree under `diffRoot` converting children of
618 `oldRoot` to `newRoot` using `opcodes`."""
619 oldChildren
= dispatcher
.getChildren(oldRoot
)
620 newChildren
= dispatcher
.getChildren(newRoot
)
621 for opcode
in opcodes
:
622 ( command
, oldRange
, newRange
,
623 subOpcodes
, ) = Opcode(opcode
).resolveOpcode(oldChildren
, newChildren
)
624 if command
== Opcode
.Descend
:
625 child
= dispatcher
.copyRoot(oldRange
[0])
626 dispatcher
.addChild(diffRoot
, child
)
627 buildTree(dispatcher
, child
,
628 subOpcodes
, oldRange
[0], newRange
[0])
630 dispatcher
.mergeChildren(diffRoot
, oldRoot
, newRoot
,
631 command
, oldRange
, newRange
)
633 def cleanOpcodes(opcodes
):
634 """Replace some nasty results in `opcodes` by cleaner versions."""
635 for i
in range(len(opcodes
)):
636 opcode
= Opcode(opcodes
[i
])
637 subOpcodes
= opcode
.getSubOpcodes()
639 # Nothing to clean for flat opcodes
642 cleanOpcodes(subOpcodes
)
644 while j
< len(subOpcodes
):
645 prev
= Opcode(subOpcodes
[j
- 1])
646 this
= Opcode(subOpcodes
[j
])
647 if (this
.getCommand() != Opcode
.Descend
648 and prev
.getCommand() == this
.getCommand()):
649 # Merge adjacing opcodes of same type
650 prevOld
= prev
.getOldRange()
651 prevNew
= prev
.getNewRange()
652 thisOld
= this
.getOldRange()
653 thisNew
= this
.getNewRange()
654 prev
.setOldRange(( prevOld
[0], thisOld
[1], ))
655 prev
.setNewRange(( prevNew
[0], thisNew
[1], ))
656 subOpcodes
[j
- 1:j
+ 1] = [ prev
.asTuple(), ]
659 opcode
.setSubOpcodes(subOpcodes
)
660 if len(subOpcodes
) == 1:
661 subOpcode
= Opcode(subOpcodes
[0])
662 if subOpcode
.getCommand() != Opcode
.Descend
:
663 # Propagate 1-element sequences up
664 opcode
.setCommand(subOpcode
.getCommand())
665 opcodes
[i
] = opcode
.asTuple()
667 def createDiff(oldTree
, newTree
):
668 """Create and return a diff document from `oldTree` to `newTree`."""
669 dispatcher
= DocutilsDispatcher()
670 #dispatcher.debug = True
671 opcodes
= doDiff(dispatcher
, oldTree
, newTree
)
673 from pprint
import pprint
674 print(oldTree
.asdom().toprettyxml())
675 print(newTree
.asdom().toprettyxml())
676 pprint(opcodes
, sys
.stdout
, 2, 40, None)
677 print("^^^ Before cleaning vvv After cleaning")
678 cleanOpcodes(opcodes
)
680 from pprint
import pprint
681 pprint(opcodes
, sys
.stdout
, 2, 40, None)
682 if len(opcodes
) != 1:
683 raise TypeError("Don't how to merge documents which are not rootEq")
684 opcode
= Opcode(opcodes
[0])
685 if opcode
.getCommand() not in ( Opcode
.Descend
, Opcode
.Equal
, ):
686 raise TypeError("Don't how to merge top level opcode of type %r"
687 % ( opcode
.getCommand(), ))
689 diffDoc
= buildDocument(oldTree
, newTree
, opcodes
, pub
.settings
)
690 if opcode
.getCommand() == Opcode
.Equal
:
691 # TODO Equality should be reported somehow
692 diffDoc
.extend([ child
.deepcopy()
693 for child
in newTree
.children
])
695 buildTree(dispatcher
, diffDoc
, opcode
.getSubOpcodes(), oldTree
, newTree
)
698 if __name__
== '__main__':
699 pub
= processCommandLine()
701 oldTree
= readTree(pub
, pub
.settings
._old
_source
)
702 newTree
= readTree(pub
, pub
.settings
._new
_source
)
704 Text2Words(oldTree
).apply()
705 Text2Words(newTree
).apply()
707 diffDoc
= createDiff(oldTree
, newTree
)
708 Words2Text(diffDoc
).apply()
710 pub
.writer
.write(diffDoc
, pub
.destination
)
711 pub
.writer
.assemble_parts()