Implement <xm:RemoveAttributes/>
[xmlmerge.git] / xmlmerge.py
blobdc428e18b2bbdcaf56ab3cce5502b2c6e0d99521
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
7 # This file is part of XML Merge.
9 # XML Merge is free software: you can redistribute it and/or modify it
10 # under the terms of the GNU Lesser General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or (at
12 # your option) any later version.
14 # XML Merge is distributed in the hope that it will be useful, but
15 # WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU Lesser General Public License for more details.
19 # You should have received a copy of the GNU Lesser General Public License
20 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
23 # Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
25 # TODO: What if an attribute should include the '{' or '}' chars?
27 """
28 The purpose of XML Merge is to preprocess any kind of XML file with great
29 flexibility.
31 XML Merge performs (among other things) recursive XML file inclusion and
32 XML element and attribute modification.
34 XML Merge is a Python module. It is normally invoked as a program from the
35 command line, but can equally well be used from within another Python
36 program or module.
37 """
39 ## IMPORTS AND CONSTANTS
41 import copy
42 import itertools
43 import optparse
44 import os
45 import re
46 import sys
47 import textwrap
49 import lxml.etree as ET
51 # Namespace mapping (can be directly used for lxml nsmap arguments):
52 xmns = {"xm": "urn:felixrabe:xmlns:xmlmerge:preprocess",
53 "xmt": "urn:felixrabe:xmlns:xmlmerge:inctrace"}
56 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    optparse.OptionParser subclass that registers every XML Merge command
    line option on construction.
    """

    def __init__(self, *a, **kw):
        optparse.OptionParser.__init__(self, *a, **kw)

        # One (flags, add_option keyword arguments) pair per option, in the
        # order in which the options are registered:
        option_table = (
            (("-i", "--input"),
             dict(help="(REQUIRED) input XML file")),
            (("-o", "--output"),
             dict(help="output XML file (.out.xml if not given)")),
            (("-s", "--xml-schema"),
             dict(help="XML Schema (.xsd) to validate output against")),
            (("-r", "--reference"),
             dict(help="reference XML file to compare output against")),
            (("-d", "--html-diff"),
             dict(action="store_true",
                  help="only with -r; if output and reference differ, "
                       "produce a HTML file showing the differences")),
            (("-t", "--trace-includes"),
             dict(action="store_true",
                  help="add tracing information to included XML fragments")),
            (("-v", "--verbose"),
             dict(action="store_const", dest="verbose", const=3,
                  help="show debugging messages")),
            (("-q", "--quiet"),
             dict(action="store_const", dest="verbose", const=1,
                  help="only show error messages")),
        )
        for flags, settings in option_table:
            self.add_option(*flags, **settings)

        # Levels of verbosity (both -q and -v store into "verbose"):
        #   --quiet   -> verbose == 1   # only show error messages
        #   (neither) -> verbose == 2   # the default set below
        #   --verbose -> verbose == 3   # show debugging messages
        self.set_defaults(verbose=2)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    argv is the full argument vector; argv[0] (the program name) is
    skipped. On an invalid argument list, option_parser.error() is called.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line, and that the required input option is present. This is
    # an explicit test rather than an 'assert' because assertions are
    # stripped when Python runs with -O:
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for n in ("input", "output", "reference"):
        filename = getattr(options, n)
        if filename is None:
            continue  # e.g. if "-r" was not given
        setattr(options, n, os.path.abspath(filename))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go.
    # Only OSError is ignored here, so that unrelated problems (e.g.
    # KeyboardInterrupt) are no longer silently swallowed:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass  # fail later if there still is no output directory now

    return options
146 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the XML file named input_filename and hand back the root element
    of the resulting element tree.
    """
    return ET.parse(input_filename).getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Drop unnecessary namespace declarations and whitespace-only text.
    The result is a modified copy of output_xml; the element passed in may
    itself be modified by this call.
    """
    # Remove unused namespace declarations by attaching the element to a
    # temporary root and detaching it again:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    scratch_root = ET.Element("NS_ROOT", nsmap=xmns)
    scratch_root.append(output_xml)
    scratch_root.remove(output_xml)

    # Without this copy, getroottree() on the elements would report the
    # temporary tree containing the empty NS_ROOT element. This is not a
    # hack, this is about how lxml works.
    detached_tree = ET.ElementTree(copy.copy(output_xml))
    cleaned_xml = detached_tree.getroot()

    # Make pretty-printing work: blank out text and tail strings that
    # consist of whitespace only.
    for node in cleaned_xml.iter():
        for slot in ("text", "tail"):
            content = getattr(node, slot)
            if content and not content.strip():
                setattr(node, slot, None)

    return cleaned_xml
def write_output_file(output_xml, output_filename):
    """
    Serialize the tree that output_xml belongs to into the file named
    output_filename (pretty-printed, with XML declaration, UTF-8 encoded).
    """
    write_settings = dict(pretty_print=True, xml_declaration=True,
                          encoding="utf-8")
    output_xml.getroottree().write(output_filename, **write_settings)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema file named xml_schema_filename and build the
    corresponding XML Schema object from it.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate output against XML Schema.

    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False.

    Unless --quiet was given (options.verbose < 2), the outcome is printed;
    on a mismatch the last entry of the schema's error log is printed too.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        # Single-argument print(...) behaves the same as a statement
        # (Python 2) and as a function call (Python 3).
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    The output_xml argument is not used; the comparison is made on the
    files as written to disk.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files in binary mode; the 'with' blocks make sure the file
    # handles are closed again instead of being left to the garbage
    # collector:
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - generating '%s'..."
                  % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    Byte strings that are not already of type 'str' (i.e. 'bytes' objects
    on Python 3) are decoded as UTF-8 before being compared, because
    difflib works on text lines.
    """
    import difflib

    line_lists = []
    for data in (reference_str, output_str):
        # On Python 2, 'bytes' is 'str', so this never triggers there.
        if isinstance(data, bytes) and not isinstance(data, str):
            data = data.decode("utf-8", "replace")
        line_lists.append(data.splitlines())
    reference_lines, output_lines = line_lists

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # Close the file deterministically instead of leaking the handle:
    with open(html_filename, "w") as html_file:
        html_file.write(html_str)
270 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml)  # input_xml is modified in-place

    An instance keeps a stack of Python namespaces (dicts). The topmost
    dict is the namespace used for evaluating expressions found in the XML
    being processed.
    """

    def __init__(self):
        super(XMLPreprocess, self).__init__()
        # The bottom entry is the default namespace; it is never popped.
        self._namespace_stack = [{}]

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place. Returns None.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitution. If it is None, the namespace currently on top of the
        namespace stack is used.

        If trace_includes is True, the output will contain tags that
        surround included sections of the file. The xml_filename argument
        is then required.

        Processing tags will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.

        Raises Exception if xml_element is in the "xm" namespace but no
        matching _xm_<tag> method exists.
        """
        print("Processing %s" % xml_element.tag)

        # Only a namespace pushed by this call may be popped by this call.
        # Popping unconditionally would empty the stack as soon as a
        # nested call (which passes namespace=None) returns.
        pushed = namespace is not None
        if pushed:
            self._namespace_stack.append(namespace)
        try:
            self.namespace = self._namespace_stack[-1]
            self.trace_includes = trace_includes
            self.xml_filename = xml_filename

            ns = "{%s}" % xmns["xm"]
            len_ns = len(ns)

            # Evaluate Python expressions in the attributes of xml_element:
            for attr_name, attr_value in xml_element.items():  # attr map
                v = self._eval_substitution(attr_value, self.namespace)
                xml_element.set(attr_name, v)

            # If xml_element has xmns["xm"] as its namespace, proceed with
            # the appropriate method of this class:
            if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
                tag = xml_element.tag[len_ns:]  # tag without namespace
                method = "_xm_" + tag.lower()  # tolerate any case
                if not hasattr(self, method):
                    raise Exception("cannot process <xm:%s/>" % tag)
                getattr(self, method)(xml_element)  # call the method
                # The processing element itself never reaches the output:
                xml_element.getparent().remove(xml_element)

            # If not, recurse:
            else:
                self._recurse_into(xml_element)
        finally:
            if pushed:
                self._namespace_stack.pop()
            # Nested calls overwrite self.namespace; point it back at the
            # namespace that is now on top of the stack.
            self.namespace = self._namespace_stack[-1]
        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess all child elements of xml_element.

        If namespace is given, it is pushed onto the namespace stack for
        the duration of the recursion and removed again afterwards.
        """
        pushed = namespace is not None
        if pushed:
            self._namespace_stack.append(namespace)
        try:
            # xpath() returns a list, so children removed or added during
            # processing do not disturb this loop.
            for xml_sub_element in xml_element.xpath("*"):
                self(xml_sub_element, None,
                     self.trace_includes, self.xml_filename)
        finally:
            if pushed:
                self._namespace_stack.pop()
                # Continue with the enclosing namespace, not the one that
                # has just been discarded.
                self.namespace = self._namespace_stack[-1]

    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, attr_value, namespace):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)). Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python", {})
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.

        WARNING: the text between the braces is passed to eval() with
        namespace as its globals; only process XML input you trust.
        """
        new_a_value = []  # faster than always concatenating strings
        last_index = 0
        for match in self._eval_substitution_regex.finditer(attr_value):
            new_a_value.append(attr_value[last_index:match.start()])
            result = str(eval(match.group(1), namespace))
            new_a_value.append(result)
            last_index = match.end()
        new_a_value.append(attr_value[last_index:])
        return "".join(new_a_value)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of the three attributes has to be given.

        TODO: only the attribute check exists so far; nothing is added yet.
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")
        assert sum((to is None, before is None, after is None)) == 2
        select = to or before or after  # not used yet, see TODO above

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the processed children behind the block element. Iterating
        # in reverse while always inserting directly after the block keeps
        # them in their original order.
        for xml_sub_element in xml_element[::-1]:
            xml_element.addnext(xml_sub_element)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # that's it

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select).

        TODO: not implemented yet.
        """

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter. Example:

            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().

        TODO: the loop body is not implemented yet.
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_list = eval(xml_element.get(loop_counter_name),
                                 self.namespace)

        # Loop:
        addnext_to_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            pass

    def _xm_pythonexec(self, xml_element):
        """
        Execute Python code in the current namespace.

        'self' and 'xml_element' are supplied temporarily. They are added
        to the current namespace before the code is executed, and removed
        again afterwards, even if the code raises an exception.

        An element without text executes nothing.
        """
        code = textwrap.dedent(xml_element.text or "").strip()
        self.namespace["self"] = self
        self.namespace["xml_element"] = xml_element
        try:
            # The tuple form is accepted by the Python 2 exec statement as
            # well as by the Python 3 exec() function.
            exec(code, self.namespace)
        finally:
            # pop() instead of del: the executed code may already have
            # removed these names itself.
            self.namespace.pop("self", None)
            self.namespace.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@select).

        The XPath expression is evaluated on the <xm:RemoveAttributes/>
        element itself.

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).

        TODO: not implemented yet.
        """

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute name="otherattr" value="hello"/>

        Leads to:
            <Object index="0x1234" otherattr="hello"/>

        TODO: not implemented yet.
        """

    def _xm_var(self, xml_element):
        """
        Set a variable.

        Each attribute name becomes a variable in the current namespace;
        its value is the result of evaluating the attribute value as a
        Python expression in that same namespace.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            ns[attr_name] = eval(attr_value, ns, ns)
484 ## MAIN FUNCTION
def main(argv):
    """
    main(argv) -> int

    Turn the input file into the output file as requested by the command
    line options, then run the requested validations.

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read. Keep this code as simple as possible if you change
    it in any way.

    Exit status codes returned by main, or raised (using SystemExit) by
    the functions it calls:
     - 0 on success, and if all requested validations (-s, -r) match.
     - On wrong options, parse_command_line() calls the option parser's
       error() method, which exits via SystemExit.
       NOTE(review): optparse documents exit status 2 for error(), which
       would coincide with the schema mismatch bit below - confirm.
     - On mismatch, a bitmap: bit 1 (value 2) is set if the XML Schema
       (-s) does not match, bit 2 (value 4) if the reference (-r) does not
       match. With N matching functions all requested and all failing, the
       result is (2 ** N - 1) * 2.
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    xml = read_input_file(options.input)
    preprocess = XMLPreprocess()
    preprocess(xml, trace_includes=options.trace_includes,
               xml_filename=options.input)
    xml = postprocess_xml(xml)
    write_output_file(xml, options.output)

    # If -s: Compare output to XML Schema file. A check that was not
    # requested never counts as a mismatch:
    schema_mismatch = False
    if options.xml_schema is not None:
        xml_schema = read_xml_schema_file(options.xml_schema)
        schema_mismatch = not match_against_schema(options, xml, xml_schema)

    # If -r: Compare output to reference:
    reference_mismatch = False
    if options.reference is not None:
        reference_mismatch = not match_against_reference(options, xml)

    # Combine the results into the mismatch bitmap (2 and/or 4):
    return (int(schema_mismatch) << 1) | (int(reference_mismatch) << 2)
# When run as a script (not imported), call main() with the process
# arguments and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))