Use tag URI instead of URN (which was used without registration) for the XML namespace
[xmlmerge.git] / xmlmerge.py
blob4a0ee86ae7c660459cd28cc6ad43c0a4964bf927
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # XML Merge 2.0.1.git
6 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
8 # The main() function (search "def main") is a good starting point for
9 # understanding the code.
12 # This file is part of XML Merge.
14 # XML Merge is free software: you can redistribute it and/or modify it
15 # under the terms of the GNU Lesser General Public License as published by
16 # the Free Software Foundation, either version 3 of the License, or (at
17 # your option) any later version.
19 # XML Merge is distributed in the hope that it will be useful, but
20 # WITHOUT ANY WARRANTY; without even the implied warranty of
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 # GNU Lesser General Public License for more details.
24 # You should have received a copy of the GNU Lesser General Public License
25 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
28 # Developed (i.e. tested) mainly on Win32 using Python 2.6.4 + lxml 2.2.2,
29 # and to a lesser extent also with Python 2.5.4 + lxml 2.1.1.
31 """
32 The purpose of XML Merge is to preprocess any kind of XML file with great
33 flexibility.
35 XML Merge performs (among other things) recursive XML file inclusion and
36 XML element and attribute modification.
38 XML Merge is a Python module. It is normally invoked as a program from the
39 command line, but can equally well be used from within another Python
40 program or module.
41 """
# Complete version tuple; __version__ is built from its first two
# components only ("major.minor").
__version_info__ = (2, 0, 99, "git")
__version__ = ".".join(map(str, __version_info__[:2]))
46 ## IMPORTS AND CONSTANTS
48 import copy
49 import itertools
50 import optparse
51 import os
52 import re
53 import sys
54 import textwrap
56 import lxml.etree as ET
# Namespace mapping, prefix -> namespace URI (can be passed directly as the
# nsmap argument of lxml element constructors):
xmns = dict(
    xm="tag:felixrabe.net,2011:xmlns:xmlmerge:preprocess",
    xmt="tag:felixrabe.net,2011:xmlns:xmlmerge:inctrace",
)
63 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    optparse.OptionParser preconfigured with all XML Merge options.

    Levels of verbosity (all stored in the 'verbose' option):

        --quiet    -> verbose == 1   # only show error messages
                   -> verbose == 2   # no verbosity option given
        --verbose  -> verbose == 3   # show debugging messages
    """

    # One (flags, add_option keyword arguments) entry per option, listed in
    # the order in which the options appear in the help output:
    _OPTION_TABLE = (
        (("-i", "--input"),
         dict(help="(REQUIRED) input XML file")),
        (("-o", "--output"),
         dict(help="output XML file (.out.xml if not given)")),
        (("-s", "--xml-schema"),
         dict(help="XML Schema (.xsd) to validate output against")),
        (("-r", "--reference"),
         dict(help="reference XML file to compare output against")),
        (("-d", "--html-diff"),
         dict(action="store_true",
              help="only with -r; if output and reference differ, "
                   "produce a HTML file showing the differences")),
        (("-t", "--trace-includes"),
         dict(action="store_true",
              help="add tracing information to included XML fragments")),
        (("-v", "--verbose"),
         dict(action="store_const", dest="verbose", const=3,
              help="show debugging messages")),
        (("-q", "--quiet"),
         dict(action="store_const", dest="verbose", const=1,
              help="only show error messages")),
    )

    def __init__(self, *a, **kw):
        optparse.OptionParser.__init__(self, *a, **kw)
        for flags, settings in self._OPTION_TABLE:
            self.add_option(*flags, **settings)
        self.set_defaults(verbose=2)

    def error(self, *a, **kw):
        """
        Print the complete help text, then delegate to the regular optparse
        error handling.
        """
        self.print_help()
        return optparse.OptionParser.error(self, *a, **kw)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given. The resulting options are provided
    in a normalized format: the input, output and reference filenames are
    converted to absolute pathnames, and a missing output filename is
    derived from the input filename.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    If positional arguments are present or the input option is missing, the
    option parser's error() method is called with "Error: invalid argument
    list".
    """
    # Parse options using OptionParser (argv[0] is the program name):
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line, and that the required input option is present. This is
    # an explicit test (not an assert) so that it still works when Python
    # runs with -O:
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        stem = options.input
        if stem.lower().endswith(".xml"):
            stem = stem[:-4]
        options.output = stem + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for name in ("input", "output", "reference"):
        value = getattr(options, name)
        if value is None:
            continue  # e.g. if "-r" was not given
        setattr(options, name, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        # Usually the directory exists already. Any other problem makes
        # writing the output file fail later, which reports the cause.
        pass

    return options
157 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the input file and return the root element of the resulting
    element tree.
    """
    document = ET.parse(input_filename)
    return document.getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Strip unnecessary namespace declarations and whitespace-only text.
    The result is a modified copy of output_xml; the element passed in may
    itself be modified by this call.
    """
    # Remove unused namespace declarations by moving the element into, and
    # straight back out of, a temporary root declaring the XML Merge
    # namespaces:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    scratch_root = ET.Element("NS_ROOT", nsmap=xmns)
    scratch_root.append(output_xml)
    scratch_root.remove(output_xml)
    # Without this copy, getroottree() of every output element would report
    # the temporary tree containing the empty NS_ROOT element. This is not a
    # hack, this is about how lxml works.
    detached = copy.copy(output_xml)
    output_xml = ET.ElementTree(detached).getroot()

    # Make pretty-printing work by dropping text and tail strings that
    # consist of whitespace only:
    for node in output_xml.iter():
        for slot in ("text", "tail"):
            content = getattr(node, slot)
            if content and not content.strip():
                setattr(node, slot, None)

    return output_xml
def write_output_file(output_xml, output_filename):
    """
    Serialize the tree that output_xml belongs to into output_filename
    (pretty-printed, UTF-8 encoded, with an XML declaration).
    """
    serialization = dict(pretty_print=True, xml_declaration=True,
                         encoding="utf-8")
    output_xml.getroottree().write(output_filename, **serialization)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema file and return the XML Schema object built from
    it.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml):
    """
    match_against_schema(options, output_xml) -> bool

    Validate output against XML Schema (file options.xml_schema).

    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False. Unless --quiet was given
    (options.verbose < 2), the outcome is printed, followed by the last
    entry of the schema's error log if validation failed.
    """
    xml_schema = read_xml_schema_file(options.xml_schema)
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def _read_binary_file(filename):
    """Return the complete contents of filename as a byte string."""
    # try/finally rather than a with statement keeps Python 2.5 working.
    handle = open(filename, "rb")
    try:
        return handle.read()
    finally:
        handle.close()


def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    ("<output filename>.diff.html") showing the differences.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    The output_xml argument is not used; the comparison works on the
    output file as written to disk.
    """
    output_filename = options.output
    do_html_diff = options.html_diff

    # Both files are closed again as soon as they have been read:
    reference_str = _read_binary_file(options.reference)
    output_str = _read_binary_file(output_filename)
    is_valid = (reference_str == output_str)

    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            print("Output and reference differ.")

    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - generating '%s'..."
                  % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def _bytes_to_text(data):
    """
    Return data decoded as UTF-8 if it is a Python 3 bytes object,
    otherwise return it unchanged.
    """
    # Python 3 bytes objects offer decode() but no encode(). Python 2 str
    # and unicode objects offer both and are passed through untouched, so
    # behaviour on Python 2 is unchanged.
    if hasattr(data, "decode") and not hasattr(data, "encode"):
        return data.decode("utf-8", "replace")
    return data


def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    The two columns are titled "Reference" and "Output"; long lines are
    wrapped at 75 columns.
    """
    import difflib

    reference_lines = _bytes_to_text(reference_str).splitlines()
    output_lines = _bytes_to_text(output_str).splitlines()

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # Close the file explicitly so the report is flushed to disk at once
    # (try/finally rather than a with statement keeps Python 2.5 working):
    html_file = open(html_filename, "w")
    try:
        html_file.write(html_str)
    finally:
        html_file.close()
282 ## VARIOUS FUNCTIONS
def print_xml_error(xml_element, code=None):
    """
    Write a description of where an XML processing error occurred to
    sys.stderr.

    The report names the URL of the document that xml_element belongs to,
    the source line of xml_element, and its XPath within the document. If
    code (a string) is given, it is shown as the offending Python code or
    expression.
    """
    def emit(line):
        # sys.stderr.write() replaces the Python-2-only "print >>" syntax.
        sys.stderr.write(line + "\n")

    emit("*** XML ERROR ***")
    tree = xml_element.getroottree()
    emit("File URL: %s" % (tree.docinfo.URL,))
    xpath = tree.getpath(xml_element)
    emit("Line: %s  XPath: %s" % (xml_element.sourceline, xpath))
    if code is not None:
        emit("Offending Python code / expression:")
        # Continuation lines of multi-line code get the same indent as the
        # first line:
        emit(" %s" % code.replace("\n", "\n "))
# Matches one "{...}" group; non-greedy, so several groups per string work,
# but nested braces do not.
_brace_substitution_regex = re.compile(r"\{(.*?)\}")


def brace_substitution(string, xml_element=None, namespace=None):
    """
    Evaluate Python expressions within strings.

    This function substitutes Python expressions embedded in strings for
    their evaluated (string) values, like {x} -> str(eval(x)). Example:

    >>> brace_substitution("3 + 5 = {3 + 5} in Python")
    '3 + 5 = 8 in Python'

    Multiple Python expressions in one string are supported as well. Nested
    Python expressions are not supported.

    The expressions are evaluated with namespace (a dict, empty if omitted)
    as their global namespace. If an expression raises an exception and
    xml_element is given, an error report for that element is written to
    sys.stderr before the exception is re-raised.
    """
    if namespace is None:
        namespace = {}
    pieces = []  # faster than continuously concatenating strings
    last_index = 0
    for match in _brace_substitution_regex.finditer(string):
        pieces.append(string[last_index:match.start()])
        expression = match.group(1)
        try:
            # NOTE(security): eval() executes arbitrary code taken from the
            # processed document; only process trusted input.
            result = str(eval(expression, namespace))
        except Exception:
            if xml_element is not None:
                print_xml_error(xml_element, code=expression)
                sys.stderr.write("\n")
            raise
        pieces.append(result)
        last_index = match.end()
    pieces.append(string[last_index:])
    return "".join(pieces)
329 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Preprocessor that executes the <xm:.../> processing elements found in
    an XML element tree.

    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml, xml_filename=input_filename)  # input_xml changes

    Every processing element <xm:Name/> is handled by the method named
    "_xm_" + "name" (the tag name in lower case) and is removed from the
    tree afterwards.
    """

    def __init__(self, initial_namespace=None):
        """
        initial_namespace is the dict used as the outermost Python
        namespace. If given, it is used (and modified) directly; if
        omitted, every instance gets a fresh empty dict of its own.
        """
        super(XMLPreprocess, self).__init__()
        if initial_namespace is None:
            # A "{}" default in the signature would be one single dict
            # shared by (and modified through) all instances.
            initial_namespace = {}
        self._namespace_stack = [initial_namespace]

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitution. If given, it is pushed on the namespace stack and
        stays there.

        If trace_includes is True, the output will contain tags that
        surround included sections of the file. The xml_filename argument
        is then required.
        NOTE(review): this class only stores and passes on trace_includes;
        no code visible here emits such tags.

        Processing tags will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = trace_includes
        self.xml_filename = xml_filename

        ns = "{%s}" % xmns["xm"]
        len_ns = len(ns)

        # Evaluate Python expressions in the attributes of xml_element:
        for attr_name, attr_value in xml_element.items():  # attr map
            v = brace_substitution(attr_value, xml_element, self.namespace)
            xml_element.set(attr_name, v)

        # If xml_element has xmns["xm"] as its namespace, proceed with the
        # appropriate method of this class:
        if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
            tag = xml_element.tag[len_ns:]  # just the tag without namespc
            method = "_xm_" + tag.lower()  # tolerate any case
            if not hasattr(self, method):
                raise Exception("cannot process <xm:%s/>" % tag)
            getattr(self, method)(xml_element)  # call the method
            # Preserve tail text by handing it to the preceding sibling
            # or, if there is none, to the parent's text:
            tail = xml_element.tail
            if tail is not None:
                prev = xml_element.getprevious()
                parent = xml_element.getparent()
                if prev is not None:
                    prev.tail = (prev.tail or "") + tail
                else:
                    parent.text = (parent.text or "") + tail
            xml_element.getparent().remove(xml_element)

        # If not, recurse:
        else:
            self._recurse_into(xml_element)

        return None

    @staticmethod
    def _require(condition, message):
        """
        Raise AssertionError(message) unless condition is true.

        Unlike an assert statement, this check is not removed when Python
        runs with -O.
        """
        if not condition:
            raise AssertionError(message)

    def _evaluate(self, xml_element, expression, namespace):
        """
        Return eval(expression, namespace). On failure, report xml_element
        and the expression on sys.stderr and re-raise the exception.
        """
        try:
            # NOTE(security): eval() executes arbitrary code taken from the
            # processed document; only process trusted input.
            return eval(expression, namespace)
        except Exception:
            print_xml_error(xml_element, code=expression)
            sys.stderr.write("\n")
            raise

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess all child elements of xml_element. If namespace is
        given, it is the active namespace while doing so and is dropped
        again afterwards.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        for xml_sub_element in xml_element.xpath("*"):
            self(xml_sub_element, None,
                 self.trace_includes, self.xml_filename)
        if namespace is not None:
            self._namespace_stack.pop()
            self.namespace = self._namespace_stack[-1]

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of (@to, @before, @after) must be specified. And the
        XPath expression must return exactly one element. An AssertionError
        is raised if these conditions are not met.
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")

        given = [v for v in (to, before, after) if v is not None]
        self._require(len(given) == 1,
                      "exactly one of @to, @before, @after is required")
        select = given[0]

        selected_context_nodes = xml_element.xpath(select)
        self._require(len(selected_context_nodes) == 1,
                      "XPath %r must select exactly one element" % select)
        context_node = selected_context_nodes[0]

        if to is not None:
            adder_name = "append"
        elif before is not None:
            adder_name = "addprevious"
        else:
            adder_name = "addnext"

        # Iterate over a snapshot, as the children are moved out of
        # xml_element while looping:
        for xml_sub_element in list(xml_element):
            getattr(context_node, adder_name)(xml_sub_element)
            if after is not None:
                # Keep document order: the next one goes after this one.
                context_node = xml_sub_element

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the processed children out, right behind the block element.
        # Going through them in reverse keeps their original order:
        for xml_sub_node in xml_element[::-1]:
            xml_element.addnext(xml_sub_node)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # that's it

    def _xm_defaultvar(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace, if not
        already set.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            if attr_name not in ns:
                ns[attr_name] = self._evaluate(xml_element, attr_value, ns)

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select) after preprocessing said file. Without @select, no
        elements are included.

        The @file attribute is the only required attribute. It is taken
        relative to the directory of the file being processed.

        Items can be imported from the included (and preprocessed) file's
        Python namespace into the current file's namespace using the
        @import attribute, which may either be a comma-separated list of
        identifiers, or '*' to import the complete namespace.

        Remaining attributes will be treated as variable assignments and
        put in the Python namespace used for processing the included file.
        """
        attrib = xml_element.attrib
        file_ = attrib.pop("file", None)
        select = attrib.pop("select", None)
        import_ = attrib.pop("import", None)
        self._require(file_ is not None, "@file is required")
        remaining_attribs = dict(attrib.items())

        # Load the to-be-included file:
        p = os.path
        xml_input_dirname = p.dirname(self.xml_filename)
        xml_incl_filename = p.normpath(p.join(xml_input_dirname, file_))
        # Always use '/' for normalized tracing information:
        xml_incl_filename = xml_incl_filename.replace("\\", "/")

        xml_incl = ET.parse(xml_incl_filename).getroot()

        # Build the initial namespace from a copy of the current namespace
        # plus the remaining attributes of the <xm:Include/> element:
        current_ns = self.namespace
        initial_namespace = current_ns.copy()
        for attr_name, attr_value in remaining_attribs.items():  # attr map
            initial_namespace[attr_name] = self._evaluate(
                xml_element, attr_value, current_ns)

        # Preprocess the to-be-included file:
        proc = XMLPreprocess(initial_namespace=initial_namespace)
        proc(xml_incl, trace_includes=self.trace_includes,
             xml_filename=xml_incl_filename)

        # Select elements to include:
        included_elements = []
        if select is not None:
            included_elements = xml_incl.xpath(select)

        # Include the elements, in order, behind the <xm:Include/> element:
        context_node = xml_element
        for inc_elem in included_elements:
            context_node.addnext(inc_elem)
            context_node = inc_elem

        # Import from included namespace:
        imported_namespace = {}
        if import_ is not None:
            import_ = [x.strip() for x in import_.split(",")]
            if "*" in import_:  # import all
                imported_namespace = proc.namespace
            else:
                ns = proc.namespace
                imported_namespace = dict((x, ns[x]) for x in import_)
        self.namespace.update(imported_namespace)

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter. Example:

            i="range(5, 9)" => iterates with i being 5, 6, 7, 8

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_expr = xml_element.get(loop_counter_name)
        loop_counter_list = self._evaluate(xml_element, loop_counter_expr,
                                           self.namespace)

        # Loop:
        context_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            self.namespace[loop_counter_name] = loop_counter_value
            tailtext = xml_element.tail
            xml_element.tail = None  # xml_element regarded as document
            # xml_element_copy = copy.copy(xml_element) # CRASH
            # The following line is the workaround for the preceding one:
            xml_element_copy = ET.XML(ET.tostring(xml_element))
            xml_element.addnext(xml_element_copy)  # temporarily
            xml_element.tail = xml_element_copy.tail = tailtext
            self._recurse_into(xml_element_copy)
            xml_element_copy.getparent().remove(xml_element_copy)
            if xml_element_copy.text is not None:
                if context_node.tail is None:
                    context_node.tail = u""
                context_node.tail += xml_element_copy.text
            # Move the processed children behind the previously added ones:
            for xml_sub_node in xml_element_copy[:]:
                context_node.addnext(xml_sub_node)
                context_node = xml_sub_node

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code in the current namespace.

        'self' and 'xml_element' are supplied temporarily. They are added
        to the current namespace before the code is executed, and removed
        again afterwards, even if the code raises an exception. An element
        without text executes nothing.
        """
        code = textwrap.dedent(xml_element.text or "").strip()
        ns = self.namespace
        ns["self"] = self
        ns["xml_element"] = xml_element
        try:
            # NOTE(security): executes arbitrary code taken from the
            # processed document; only process trusted input.
            # The call form exec(code, ns) is valid on Python 2 and 3.
            exec(code, ns)
        except Exception:
            print_xml_error(xml_element, code=code)
            sys.stderr.write("\n")
            raise
        finally:
            ns.pop("self", None)
            ns.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@from or @select).

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("from") or xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).
        """
        select = xml_element.get("select")
        self._require(select is not None, "@select is required")
        for el in xml_element.xpath(select):
            el.getparent().remove(el)

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the elements
        selected by XPath (@of or @select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute of="../Object" name="otherattr" value="hello"/>

        Leads to:
            <Object index="0x1234" otherattr="hello"/>
        """
        select = xml_element.get("select", xml_element.get("of"))
        name = xml_element.get("name")
        value = xml_element.get("value")
        self._require(None not in (select, name, value),
                      "@of (or @select), @name and @value are required")
        for el in xml_element.xpath(select):
            el.set(name, value)

    def _xm_text(self, xml_element):
        """
        Perform '{}' substitution on text.
        """
        text = xml_element.text
        if text is None:
            return
        # The substituted text goes in front of the tail text, which is
        # kept when the element itself is removed from the tree:
        tail = brace_substitution(text, xml_element, self.namespace)
        tail += xml_element.tail or ""
        xml_element.tail = tail

    def _xm_var(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            ns[attr_name] = self._evaluate(xml_element, attr_value, ns)
684 ## MAIN FUNCTION
def main(argv, **kargs):
    """
    main(argv, **kargs) -> int

    Process the input file to produce an output file according to the
    command line options, given in argv. These keyword arguments (**kargs)
    are recognized:

    initial_namespace
        Gets passed on as the initial Python namespace to XMLPreprocess().

    After the XML Merge Manual, the code of this function is the first part
    of XML Merge any new developer should read. So keep this code as simple
    as possible if you change it in any way.

    Return value (to be used as the exit status):
    - 0 on success, and if all requested validations (-s, -r) match.
    - On mismatch, a bitmap with one bit per failed validation:
      2 for the XML Schema (-s), 4 for the reference (-r). If N matching
      functions are provided, and all are requested and all fail to match
      the output file, the result is (2 ** N - 1) * 2.

    Wrong options do not make this function return; see
    parse_command_line(), which hands them to the option parser's error().
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    xml = read_input_file(options.input)
    preprocess = XMLPreprocess(**kargs)
    preprocess(xml, trace_includes=options.trace_includes,
               xml_filename=options.input)
    xml = postprocess_xml(xml)
    write_output_file(xml, options.output)

    # (option value, matching function, bit number in the mismatch bitmap),
    # in the order in which the validations are run:
    validations = (
        (options.xml_schema, match_against_schema, 1),       # -s: 2
        (options.reference, match_against_reference, 2),     # -r: 4
    )

    # Run each requested validation and collect the mismatch bitmap:
    mismatch_bitmap = 0
    for requested, matches, bit in validations:
        if requested is not None and not matches(options, xml):
            mismatch_bitmap |= 1 << bit
    return mismatch_bitmap
# Run as a program: the result of main() becomes the process exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)