Standard transformations are done. Tests added. HTML output works.
[docutils/kirr.git] / sandbox / rstdiff / rstdiff.py
blob6866d332faff81619daece05df15052e251b9591
1 #!/usr/bin/env python
3 # Copyright (C) 2010 Stefan Merten
5 # rstdiff.py is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published
7 # by the Free Software Foundation; either version 2 of the License,
8 # or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 # General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
18 # 02111-1307, USA.
20 """
21 Generates a structural diff from two reStructuredText input documents
22 and produces an annotated result.
23 """
25 __docformat__ = 'reStructuredText'
27 try:
28 import locale
29 locale.setlocale(locale.LC_ALL, '')
30 except:
31 pass
33 import os, re, sys
35 import docutils
36 from docutils import frontend, writers, nodes, SettingsSpec
37 from docutils.core import Publisher
38 from docutils.utils import SystemMessage, Reporter, new_reporter, new_document
39 from docutils.frontend import OptionParser, make_paths_absolute
40 from docutils.transforms import Transform
42 from treediff import TreeMatcher, HashableNodeImpl
44 ###############################################################################
45 ###############################################################################
46 # Command line specification
48 description = ("""Generates a structural diff from two reStructuredText input
49 documents and produces an annotated result. """)
51 writerOption = 'writer'
52 writerDefault = 'xml'
53 writerArgRE1 = '^--' + writerOption + '=' + '(.*)$'
55 settings_spec = (
56 'rstdiff options',
57 None,
58 (('Select writer to write output with (default "xml").',
59 ['--' + writerOption],
60 {}),
64 settings_defaults = {'output_encoding_error_handler': 'xmlcharrefreplace',
65 writerOption: writerDefault}
67 config_section = 'rstdiff'
69 usage = '%prog [options]... <old> [<new> [<output>]]'
71 ###############################################################################
72 # Classes for three argument command lines
74 class Publisher3Args(Publisher):
76 def setup_option_parser(self, usage=None, description=None,
77 settings_spec=None, config_section=None,
78 **defaults):
79 if config_section:
80 if not settings_spec:
81 settings_spec = SettingsSpec()
82 settings_spec.config_section = config_section
83 parts = config_section.split()
84 if len(parts) > 1 and parts[-1] == 'application':
85 settings_spec.config_section_dependencies = ['applications']
86 #@@@ Add self.source & self.destination to components in future?
87 option_parser = OptionParser3Args(
88 components=(self.parser, self.reader, self.writer, settings_spec),
89 defaults=defaults, read_config_files=1,
90 usage=usage, description=description)
91 return option_parser
93 class OptionParser3Args(OptionParser):
95 def check_values(self, values, args):
96 """Store positional arguments as runtime settings."""
97 values._old_source, values._new_source, values._destination = self.check_args(args)
98 make_paths_absolute(values.__dict__, self.relative_path_settings,
99 os.getcwd())
100 values._config_files = self.config_files
101 return values
103 def check_args(self, args):
104 old_source = new_source = destination = None
105 if not args:
106 self.error('At least 1 argument required.')
107 else:
108 old_source = args.pop(0)
109 if old_source == '-': # means stdin
110 old_source = None
111 if args:
112 new_source = args.pop(0)
113 if new_source == '-': # means stdin
114 new_source = None
115 if args:
116 destination = args.pop(0)
117 if destination == '-': # means stdout
118 destination = None
119 if args:
120 self.error('Maximum 3 arguments allowed.')
121 if old_source is None and new_source is None:
122 self.error('Old and new source may not both use stdin.')
123 if (old_source and old_source == destination
124 or new_source and new_source == destination):
125 self.error('Do not specify the same file for both source and '
126 'destination. It will clobber the source file.')
127 return old_source, new_source, destination
129 ###############################################################################
130 ###############################################################################
131 # Helpers
133 class Opcode(object):
134 """Encapsulates opcodes as returned by `TreeMatcher.get_opcodes()`"""
136 Replace = 'replace'
137 Delete = 'delete'
138 Insert = 'insert'
139 Equal = 'equal'
140 Descend = 'descend'
142 _tuple = None
144 def __init__(self, opcodeTuple):
145 """Initialize from a tuple returned by `TreeMatcher.get_opcodes()`"""
146 self._tuple = list(opcodeTuple)
148 def getCommand(self):
149 """Return the command."""
150 return self._tuple[0]
152 def getOldRange(self):
153 """Returns the range pertaining to an old list."""
154 return ( self._tuple[1], self._tuple[2], )
156 def getNewRange(self):
157 """Returns the range pertaining to a new list."""
158 return ( self._tuple[3], self._tuple[4], )
160 def getSubOpcodes(self):
161 """Return the sub-opcodes in case of `command` == 'descend' or
162 `None`."""
163 if self._tuple[0] != self.Descend:
164 return None
165 return self._tuple[5]
167 def resolveOpcode(self, oldList, newList):
168 """Resolves opcode pertaining to `oldList` and `newList`. Returns tuple
169 consisting of
171 command
172 Same as self.getCommand().
174 oldRange
175 The range of elements in `oldList` affected by the opcode.
177 newRange
178 The range of elements in `newList` affected by the opcode.
180 subOpcodes
181 Same as self.getSubOpcodes().
183 oldRange = self.getOldRange()
184 newRange = self.getNewRange()
185 return ( self.getCommand(), oldList[oldRange[0]:oldRange[1]],
186 newList[newRange[0]:newRange[1]], self.getSubOpcodes())
188 def setSubOpcodes(self, opcodes):
189 """Set the sub-opcodes to a new list."""
190 if self._tuple[0] != self.Descend:
191 raise TypeError("Can not set subopcodes of a %r opcode"
192 % ( self._tuple[0], ))
193 self._tuple[5] = opcodes
195 def setCommand(self, command):
196 """Set a new command adapting subopcodes."""
197 if self._tuple[0] == command:
198 return
199 self._tuple[0] = command
200 if command == self.Descend:
201 self._tuple[5] = [ ]
202 else:
203 self._tuple = self._tuple[0:5]
205 def setOldRange(self, range):
206 """Sets the range pertaining to an old list."""
207 ( self._tuple[1], self._tuple[2], ) = range
209 def setNewRange(self, range):
210 """Sets the range pertaining to a new list."""
211 ( self._tuple[3], self._tuple[4], ) = range
213 def asTuple(self):
214 """Return the opcode as a tuple."""
215 return tuple(self._tuple)
217 ###############################################################################
218 ###############################################################################
219 # Additional docutils stuff
221 ###############################################################################
222 # Node types
224 class White(nodes.Text):
225 """A piece of text containing only whitespace."""
227 tagname = '#white'
229 """A regular expression matching strings for this class and returning
230 them as the first match."""
231 # TODO Could be subject to an option
232 re = '(\\s+)'
234 class Word(nodes.Text):
235 """A piece of text containing exactly one word."""
237 tagname = '#word'
239 @staticmethod
240 def splitText(text):
241 """Splits text and returns a sequence of `Word` and `White`
242 objects. Returns an empty sequence for an empty `text`."""
244 subs = re.split(White.re, text.astext())
245 result = [ ]
246 if not subs:
247 return result
248 elif re.match(White.re, subs[0]):
249 ( current, next, ) = ( White, Word, )
250 else:
251 ( current, next, ) = ( Word, White, )
252 for sub in subs:
253 result.append(current(sub))
254 ( current, next, ) = ( next, current, )
255 return result
257 ###############################################################################
258 # Transformers
260 class Text2Words(Transform):
261 """Transforms a `Text` node into a sequence of `Word`/`White`."""
263 def apply(self):
264 self.document.walk(Text2WordsVisitor(self.document))
266 class Text2WordsVisitor(nodes.SparseNodeVisitor):
268 def visit_Text(self, text):
269 words = Word.splitText(text)
270 if not words:
271 # An empty text
272 words = [ White(''), ]
273 text.parent.replace(text, words)
275 class Words2Text(Transform):
276 """Transforms a sequence of `Word`/`White` into a `Text` node."""
278 def apply(self):
279 self.document.walk(Words2TextVisitor(self.document))
281 class Words2TextVisitor(nodes.SparseNodeVisitor):
283 def visit_Word(self, word):
284 parent = word.parent
285 # Find this node and the first node of the sequence it belongs to
286 first = None
287 for i in range(len(parent)):
288 if not isinstance(parent[i], nodes.Text):
289 first = None
290 elif first is None:
291 first = i
292 # ``parent.index(word)`` uses value equality - can not be
293 # used here to find `word`
294 if id(parent[i]) == id(word):
295 end = i + 1
296 break
297 else:
298 raise IndexError("Can not find %r in its parent" % ( word, ))
300 if (len(parent) > end
301 and isinstance(parent[end], nodes.Text)):
302 # The visitor processes following children even if they are
303 # deleted - so work for last node of a sequence
304 return
306 text = nodes.Text(reduce(lambda s, node: s + node.astext(),
307 parent[first:end], ""))
308 parent[first:end] = ( text, )
310 visit_White = visit_Word
312 ###############################################################################
313 ###############################################################################
314 # Hashable
316 class DocutilsDispatcher(HashableNodeImpl):
317 """Implements hashable for a docutils `Node` and supports construction."""
319 debug = False
321 def __init__(self):
322 super(self.__class__, self).__init__(nodes.Node)
324 def dispatchClass(self, function, node, *args):
325 """Dispatch a call of type `function` for the class of `node` using
326 arguments `node` and `args`. Default is to dispatch for imaginary class
327 "UNKNOWN"."""
328 pat = "%s_%%s" % ( function, )
329 try:
330 name = pat % ( node.__class__.__name__, )
331 method = getattr(self, name)
332 except AttributeError:
333 name = pat % ( 'UNKNOWN', )
334 method = getattr(self, name)
335 if self.debug:
336 print("*** %s(%s)"
337 % ( name, ", ".join([ arg.__class__.__name__
338 for arg in ( node, ) + args ]), ))
339 for arg in ( node, ) + args:
340 print(" %s" % ( arg, ))
341 result = method(node, *args)
342 if self.debug:
343 print(" %s" % ( result, ))
344 return result
346 ###########################################################################
347 ###########################################################################
348 # Implementation of abstract methods for `HashableNodeImpl`
350 def rootHash(self, node):
351 """Return a hash for the root only. Subclasses must override
352 this."""
353 return self.dispatchClass('rootHash', node)
355 def rootHash_UNKNOWN(self, node):
356 return hash(node.__class__)
358 def rootEq(self, node, other):
359 """Returns root equality of `node` and an `other` node. ``True`` if
360 the two nodes as roots are equal without considering their
361 children. This should be true if one node can be replaced by
362 the other and all changes can be represented without changing
363 the node itself. Subclasses must override this."""
364 # Only nodes of the same class can be equal - this assumption
365 # is used in many places
366 if node.__class__ != other.__class__:
367 return False
368 return self.dispatchClass('rootEq', node, other)
370 def rootEq_UNKNOWN(self, node, other):
371 # Unless we know better two roots of the same type are considered equal
372 return True
374 def childHash(self, node):
375 """Return a hash for the node as a child. Subclasses must override
376 this."""
377 return self.dispatchClass('childHash', node)
379 def childHash_UNKNOWN(self, node):
380 # By default compare as a child by comparing children
381 return self.childrenHash(node)
383 def childEq(self, node, other):
384 """Returns equality of `node` and an `other` node as children.
385 ``True`` if the child features of the two nodes are equal
386 without considering the root. Subclasses must override
387 this."""
388 # Only nodes of the same class can be equal - this assumption
389 # is used in many places
390 if node.__class__ != other.__class__:
391 return False
392 return self.dispatchClass('childEq', node, other)
394 def childEq_UNKNOWN(self, node, other):
395 # By default compare as a child by comparing children
396 return self.childrenEq(node, other)
398 def getChildren(self, node):
399 """Return the children of `node` as a list. Subclasses must override
400 this."""
401 return self.dispatchClass('getChildren', node)
403 def getChildren_UNKNOWN(self, node):
404 return node.children
406 ###########################################################################
407 ###########################################################################
408 # Merging
410 NewDelete = 'removed'
411 NewInsert = 'added'
412 NewReplaced = 'replaced'
413 NewReplacement = 'replacement'
415 def copyRoot(self, node):
416 """Copy `node` as root and return it."""
417 return self.dispatchClass('copyRoot', node)
419 def copyRoot_UNKNOWN(self, node):
420 return node.copy()
422 def addChild(self, root, child):
423 """Add `child` to `root`."""
424 return self.dispatchClass('addChild', root, child)
426 def addChild_UNKNOWN(self, root, child):
427 root.append(child)
429 def copyChild(self, node, newType):
430 """Copy `node` as child and return it. `newType` is ``None`` for an
431 unchanged child or the change type."""
432 return self.dispatchClass('copyChild', node, newType)
434 def copyChild_UNKNOWN(self, node, newType):
435 return self.setNewType(node.deepcopy(), newType)
437 def copyChildren(self, head, tail, root, newType):
438 """Return a range of new nodes copied from [ `head` ] + `tail` under
439 `root`. `tail` are all the same class as `head`. Nodes are
440 created approproate to type `newType`."""
441 return self.dispatchClass('copyChildren', head, tail, root, newType)
443 def copyChildren_UNKNOWN(self, head, tail, root, newType):
444 return [ self.copyChild(child, newType)
445 for child in [ head, ] + tail ]
447 def copyRange(self, root, children, newType):
448 """Return a range of new nodes copied from `children` under `root`.
449 Nodes are created appropriate to type `newType`."""
450 result = [ ]
451 begin = 0
452 while begin < len(children):
453 first = children[begin]
454 end = begin + 1
455 while end < len(children):
456 last = children[end]
457 if not(first.__class__ == last.__class__
458 or (isinstance(first, nodes.Text)
459 and isinstance(last, nodes.Text))):
460 break
461 end += 1
462 result.extend(self.copyChildren(first, children[begin + 1:end],
463 root, newType))
464 begin = end
465 return result
467 def mergeChildren(self, diffRoot, oldRoot, newRoot,
468 command, oldRange, newRange):
469 """Add children to `diffRoot` merging children `oldRange` / `newRange`
470 of `oldRoot` / `newRoot` by `command`."""
471 if command == Opcode.Equal:
472 for old in oldRange:
473 self.addChild(diffRoot, self.copyChild(old, None))
474 elif command == Opcode.Insert or command == Opcode.Delete:
475 if command == Opcode.Insert:
476 srcRoot = newRoot
477 srcRange = newRange
478 newType = self.NewInsert
479 else:
480 srcRoot = oldRoot
481 srcRange = oldRange
482 newType = self.NewDelete
483 for newChild in self.copyRange(srcRoot, srcRange, newType):
484 self.addChild(diffRoot, newChild)
485 elif command == Opcode.Replace:
486 for newChild in self.copyRange(oldRoot, oldRange,
487 self.NewReplaced):
488 self.addChild(diffRoot, newChild)
489 for newChild in self.copyRange(newRoot, newRange,
490 self.NewReplacement):
491 self.addChild(diffRoot, newChild)
492 else:
493 raise TypeError("Unhandled command %r" % ( command, ))
495 ###########################################################################
496 ###########################################################################
497 # Helpers
499 def setNewType(self, node, newType):
500 """Set a class on `node` for `newType` if set. Returns `node`."""
501 if newType:
502 node['classes'].append("change-%s" % ( newType, ))
503 return node
505 ###########################################################################
506 ###########################################################################
507 # Real comparison and merging
509 # The idea is like this: Each node has attributes which need to be
510 # compared as root and it has attributes which need to be compared
511 # as child. This is different for every node type.
513 # Similarly each node type may need special methods for cloning
514 # and merging.
516 ###########################################################################
517 # Text / Word / White
519 def rootHash_Text(self, node):
520 return hash(node.astext())
522 rootHash_Word = rootHash_Text
524 def rootHash_White(self, node):
525 # Whitespace compares all equal
526 return hash('')
528 def rootEq_Text(self, node, other):
529 return node.astext() == other.astext()
531 rootEq_Word = rootEq_Text
533 def rootEq_White(self, node, other):
534 # TODO Must behave different for places where whitespace
535 # differences are relevant
536 return True
538 # Text behaves the same as root or child
540 childHash_Text = rootHash_Text
541 childHash_Word = rootHash_Word
542 childHash_White = rootHash_White
544 childEq_Text = rootEq_Text
545 childEq_Word = rootEq_Word
546 childEq_White = rootEq_White
548 def copyChildren_Text(self, head, tail, root, newType):
549 inline = nodes.inline()
550 self.setNewType(inline, newType)
551 inline.extend([ head, ] + tail)
552 return [ inline, ]
554 # Sequences of Text are treated together
555 copyChildren_Word = copyChildren_Text
556 copyChildren_White = copyChildren_Text
558 ###############################################################################
559 ###############################################################################
560 # Main
562 def processCommandLine():
563 """Process command line and return a `Publisher`."""
564 # Determine writer here so options can be given normally
565 preWriter = writerDefault
566 for arg in sys.argv:
567 match = re.search(writerArgRE1, arg)
568 if match:
569 preWriter = match.group(1)
571 pub = Publisher3Args()
572 pub.set_reader('standalone', None, 'restructuredtext')
573 pub.set_writer(preWriter)
575 settingsSpec = SettingsSpec()
576 settingsSpec.settings_spec = settings_spec
577 settingsSpec.settings_defaults = settings_defaults
578 pub.process_command_line(usage=usage, description=description,
579 settings_spec=settingsSpec,
580 config_section=config_section)
581 if pub.settings.writer != preWriter:
582 new_reporter('<cmdline>',
583 pub.settings).severe("Internal error: Mismatch of pre-parsed (%r) and real (%r) writer"
584 % ( preWriter, pub.settings.writer, ))
585 pub.set_destination()
586 return pub
588 def readTree(pub, sourceName):
589 """Read and return a tree from `sourceName`."""
590 # Reset reader - just in case it keeps state from a previous invocation
591 pub.set_reader('standalone', None, 'restructuredtext')
592 pub.set_source(None, sourceName)
593 pub.document = None
594 pub.document = pub.reader.read(pub.source, pub.parser, pub.settings)
595 pub.apply_transforms()
596 return pub.document
598 def doDiff(hashableNodeImpl, oldTree, newTree):
599 """Create a difference from `oldTree` to `newTree` using
600 `hashableNodeImpl`. Returns the opcodes necessary to transform
601 `oldTree` to `newTree`."""
602 matcher = TreeMatcher(hashableNodeImpl, oldTree, newTree,
603 lambda node: isinstance(node, White))
604 return matcher.get_opcodes()
606 def buildDocument(oldTree, newTree, opcodes, settings):
607 """Returns a new document for the result of converting `oldTree` to
608 `newTree` using `opcodes` as description of changes."""
609 if (not isinstance(oldTree, docutils.nodes.document)
610 or not isinstance(newTree, docutils.nodes.document)):
611 raise TypeError("Roots of trees must be documents")
612 return new_document(u"%s => %s"
613 % ( settings._old_source, settings._new_source, ),
614 settings)
616 def buildTree(dispatcher, diffRoot, opcodes, oldRoot, newRoot):
617 """Adds a new sub-tree under `diffRoot` converting children of
618 `oldRoot` to `newRoot` using `opcodes`."""
619 oldChildren = dispatcher.getChildren(oldRoot)
620 newChildren = dispatcher.getChildren(newRoot)
621 for opcode in opcodes:
622 ( command, oldRange, newRange,
623 subOpcodes, ) = Opcode(opcode).resolveOpcode(oldChildren, newChildren)
624 if command == Opcode.Descend:
625 child = dispatcher.copyRoot(oldRange[0])
626 dispatcher.addChild(diffRoot, child)
627 buildTree(dispatcher, child,
628 subOpcodes, oldRange[0], newRange[0])
629 else:
630 dispatcher.mergeChildren(diffRoot, oldRoot, newRoot,
631 command, oldRange, newRange)
633 def cleanOpcodes(opcodes):
634 """Replace some nasty results in `opcodes` by cleaner versions."""
635 for i in range(len(opcodes)):
636 opcode = Opcode(opcodes[i])
637 subOpcodes = opcode.getSubOpcodes()
638 if not subOpcodes:
639 # Nothing to clean for flat opcodes
640 continue
642 cleanOpcodes(subOpcodes)
643 j = 1
644 while j < len(subOpcodes):
645 prev = Opcode(subOpcodes[j - 1])
646 this = Opcode(subOpcodes[j])
647 if (this.getCommand() != Opcode.Descend
648 and prev.getCommand() == this.getCommand()):
649 # Merge adjacing opcodes of same type
650 prevOld = prev.getOldRange()
651 prevNew = prev.getNewRange()
652 thisOld = this.getOldRange()
653 thisNew = this.getNewRange()
654 prev.setOldRange(( prevOld[0], thisOld[1], ))
655 prev.setNewRange(( prevNew[0], thisNew[1], ))
656 subOpcodes[j - 1:j + 1] = [ prev.asTuple(), ]
657 else:
658 j += 1
659 opcode.setSubOpcodes(subOpcodes)
660 if len(subOpcodes) == 1:
661 subOpcode = Opcode(subOpcodes[0])
662 if subOpcode.getCommand() != Opcode.Descend:
663 # Propagate 1-element sequences up
664 opcode.setCommand(subOpcode.getCommand())
665 opcodes[i] = opcode.asTuple()
667 def createDiff(oldTree, newTree):
668 """Create and return a diff document from `oldTree` to `newTree`."""
669 dispatcher = DocutilsDispatcher()
670 #dispatcher.debug = True
671 opcodes = doDiff(dispatcher, oldTree, newTree)
672 if dispatcher.debug:
673 from pprint import pprint
674 print(oldTree.asdom().toprettyxml())
675 print(newTree.asdom().toprettyxml())
676 pprint(opcodes, sys.stdout, 2, 40, None)
677 print("^^^ Before cleaning vvv After cleaning")
678 cleanOpcodes(opcodes)
679 if dispatcher.debug:
680 from pprint import pprint
681 pprint(opcodes, sys.stdout, 2, 40, None)
682 if len(opcodes) != 1:
683 raise TypeError("Don't how to merge documents which are not rootEq")
684 opcode = Opcode(opcodes[0])
685 if opcode.getCommand() not in ( Opcode.Descend, Opcode.Equal, ):
686 raise TypeError("Don't how to merge top level opcode of type %r"
687 % ( opcode.getCommand(), ))
689 diffDoc = buildDocument(oldTree, newTree, opcodes, pub.settings)
690 if opcode.getCommand() == Opcode.Equal:
691 # TODO Equality should be reported somehow
692 diffDoc.extend([ child.deepcopy()
693 for child in newTree.children ])
694 else:
695 buildTree(dispatcher, diffDoc, opcode.getSubOpcodes(), oldTree, newTree)
696 return diffDoc
698 if __name__ == '__main__':
699 pub = processCommandLine()
701 oldTree = readTree(pub, pub.settings._old_source)
702 newTree = readTree(pub, pub.settings._new_source)
704 Text2Words(oldTree).apply()
705 Text2Words(newTree).apply()
707 diffDoc = createDiff(oldTree, newTree)
708 Words2Text(diffDoc).apply()
710 pub.writer.write(diffDoc, pub.destination)
711 pub.writer.assemble_parts()