third_party/markdown/serializers.py

   1 # markdown is released under the BSD license
   2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
   3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
   4 # Copyright 2004 Manfred Stienstra (the original version)
   5 #
   6 # All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions are met:
  10 #
  11 # *   Redistributions of source code must retain the above copyright
  12 #     notice, this list of conditions and the following disclaimer.
  13 # *   Redistributions in binary form must reproduce the above copyright
  14 #     notice, this list of conditions and the following disclaimer in the
  15 #     documentation and/or other materials provided with the distribution.
  16 # *   Neither the name of the <organization> nor the
  17 #     names of its contributors may be used to endorse or promote products
  18 #     derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
  21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
  24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 # POSSIBILITY OF SUCH DAMAGE.
  31
  32
  33 # markdown/serializers.py
  34 #
  35 # Add x/html serialization to ElementTree
  36 # Taken from ElementTree 1.3 preview with slight modifications
  37 #
  38 # Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
  39 #
  40 # fredrik@pythonware.com
  41 # http://www.pythonware.com
  42 #
  43 # --------------------------------------------------------------------
  44 # The ElementTree toolkit is
  45 #
  46 # Copyright (c) 1999-2007 by Fredrik Lundh
  47 #
  48 # By obtaining, using, and/or copying this software and/or its
  49 # associated documentation, you agree that you have read, understood,
  50 # and will comply with the following terms and conditions:
  51 #
  52 # Permission to use, copy, modify, and distribute this software and
  53 # its associated documentation for any purpose and without fee is
  54 # hereby granted, provided that the above copyright notice appears in
  55 # all copies, and that both that copyright notice and this permission
  56 # notice appear in supporting documentation, and that the name of
  57 # Secret Labs AB or the author not be used in advertising or publicity
  58 # pertaining to distribution of the software without specific, written
  59 # prior permission.
  60 #
  61 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  62 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  63 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  64 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  65 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  66 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  67 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  68 # OF THIS SOFTWARE.
  69 # --------------------------------------------------------------------
  70
  71
  72 from __future__ import absolute_import
  73 from __future__ import unicode_literals
  74 from . import util
  75 ElementTree = util.etree.ElementTree
  76 QName = util.etree.QName
  77 if hasattr(util.etree, 'test_comment'):
  78     Comment = util.etree.test_comment
  79 else:
  80     Comment = util.etree.Comment
  81 PI = util.etree.PI
  82 ProcessingInstruction = util.etree.ProcessingInstruction
  83
  84 __all__ = ['to_html_string', 'to_xhtml_string']
  85
  86 HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
  87               "img", "input", "isindex", "link", "meta" "param")
  88
  89 try:
  90     HTML_EMPTY = set(HTML_EMPTY)
  91 except NameError:
  92     pass
  93
# Preferred prefixes for well-known namespace URIs.  _namespaces() consults
# this table when a URI found in the tree has no prefix assigned yet.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # XML Schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # Dublin Core
    "http://purl.org/dc/elements/1.1/": "dc",
}
 106
 107
 108 def _raise_serialization_error(text):
 109     raise TypeError(
 110         "cannot serialize %r (type %s)" % (text, type(text).__name__)
 111         )
 112
 113 def _encode(text, encoding):
 114     try:
 115         return text.encode(encoding, "xmlcharrefreplace")
 116     except (TypeError, AttributeError):
 117         _raise_serialization_error(text)
 118
 119 def _escape_cdata(text):
 120     # escape character data
 121     try:
 122         # it's worth avoiding do-nothing calls for strings that are
 123         # shorter than 500 character, or so.  assume that's, by far,
 124         # the most common case in most applications.
 125         if "&" in text:
 126             text = text.replace("&", "&amp;")
 127         if "<" in text:
 128             text = text.replace("<", "&lt;")
 129         if ">" in text:
 130             text = text.replace(">", "&gt;")
 131         return text
 132     except (TypeError, AttributeError):
 133         _raise_serialization_error(text)
 134
 135
 136 def _escape_attrib(text):
 137     # escape attribute value
 138     try:
 139         if "&" in text:
 140             text = text.replace("&", "&amp;")
 141         if "<" in text:
 142             text = text.replace("<", "&lt;")
 143         if ">" in text:
 144             text = text.replace(">", "&gt;")
 145         if "\"" in text:
 146             text = text.replace("\"", "&quot;")
 147         if "\n" in text:
 148             text = text.replace("\n", "&#10;")
 149         return text
 150     except (TypeError, AttributeError):
 151         _raise_serialization_error(text)
 152
 153 def _escape_attrib_html(text):
 154     # escape attribute value
 155     try:
 156         if "&" in text:
 157             text = text.replace("&", "&amp;")
 158         if "<" in text:
 159             text = text.replace("<", "&lt;")
 160         if ">" in text:
 161             text = text.replace(">", "&gt;")
 162         if "\"" in text:
 163             text = text.replace("\"", "&quot;")
 164         return text
 165     except (TypeError, AttributeError):
 166         _raise_serialization_error(text)
 167
 168
 169 def _serialize_html(write, elem, qnames, namespaces, format):
 170     tag = elem.tag
 171     text = elem.text
 172     if tag is Comment:
 173         write("<!--%s-->" % _escape_cdata(text))
 174     elif tag is ProcessingInstruction:
 175         write("<?%s?>" % _escape_cdata(text))
 176     else:
 177         tag = qnames[tag]
 178         if tag is None:
 179             if text:
 180                 write(_escape_cdata(text))
 181             for e in elem:
 182                 _serialize_html(write, e, qnames, None, format)
 183         else:
 184             write("<" + tag)
 185             items = elem.items()
 186             if items or namespaces:
 187                 items.sort() # lexical order
 188                 for k, v in items:
 189                     if isinstance(k, QName):
 190                         k = k.text
 191                     if isinstance(v, QName):
 192                         v = qnames[v.text]
 193                     else:
 194                         v = _escape_attrib_html(v)
 195                     if qnames[k] == v and format == 'html':
 196                         # handle boolean attributes
 197                         write(" %s" % v)
 198                     else:
 199                         write(" %s=\"%s\"" % (qnames[k], v))
 200                 if namespaces:
 201                     items = namespaces.items()
 202                     items.sort(key=lambda x: x[1]) # sort on prefix
 203                     for v, k in items:
 204                         if k:
 205                             k = ":" + k
 206                         write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
 207             if format == "xhtml" and tag in HTML_EMPTY:
 208                 write(" />")
 209             else:
 210                 write(">")
 211                 tag = tag.lower()
 212                 if text:
 213                     if tag == "script" or tag == "style":
 214                         write(text)
 215                     else:
 216                         write(_escape_cdata(text))
 217                 for e in elem:
 218                     _serialize_html(write, e, qnames, None, format)
 219                 if tag not in HTML_EMPTY:
 220                     write("</" + tag + ">")
 221     if elem.tail:
 222         write(_escape_cdata(elem.tail))
 223
 224 def _write_html(root,
 225                 encoding=None,
 226                 default_namespace=None,
 227                 format="html"):
 228     assert root is not None
 229     data = []
 230     write = data.append
 231     qnames, namespaces = _namespaces(root, default_namespace)
 232     _serialize_html(write, root, qnames, namespaces, format)
 233     if encoding is None:
 234         return "".join(data)
 235     else:
 236         return _encode("".join(data))
 237
 238
 239 # --------------------------------------------------------------------
 240 # serialization support
 241
 242 def _namespaces(elem, default_namespace=None):
 243     # identify namespaces used in this tree
 244
 245     # maps qnames to *encoded* prefix:local names
 246     qnames = {None: None}
 247
 248     # maps uri:s to prefixes
 249     namespaces = {}
 250     if default_namespace:
 251         namespaces[default_namespace] = ""
 252
 253     def add_qname(qname):
 254         # calculate serialized qname representation
 255         try:
 256             if qname[:1] == "{":
 257                 uri, tag = qname[1:].split("}", 1)
 258                 prefix = namespaces.get(uri)
 259                 if prefix is None:
 260                     prefix = _namespace_map.get(uri)
 261                     if prefix is None:
 262                         prefix = "ns%d" % len(namespaces)
 263                     if prefix != "xml":
 264                         namespaces[uri] = prefix
 265                 if prefix:
 266                     qnames[qname] = "%s:%s" % (prefix, tag)
 267                 else:
 268                     qnames[qname] = tag # default element
 269             else:
 270                 if default_namespace:
 271                     raise ValueError(
 272                         "cannot use non-qualified names with "
 273                         "default_namespace option"
 274                         )
 275                 qnames[qname] = qname
 276         except TypeError:
 277             _raise_serialization_error(qname)
 278
 279     # populate qname and namespaces table
 280     try:
 281         iterate = elem.iter
 282     except AttributeError:
 283         iterate = elem.getiterator # cET compatibility
 284     for elem in iterate():
 285         tag = elem.tag
 286         if isinstance(tag, QName) and tag.text not in qnames:
 287             add_qname(tag.text)
 288         elif isinstance(tag, util.string_type):
 289             if tag not in qnames:
 290                 add_qname(tag)
 291         elif tag is not None and tag is not Comment and tag is not PI:
 292             _raise_serialization_error(tag)
 293         for key, value in elem.items():
 294             if isinstance(key, QName):
 295                 key = key.text
 296             if key not in qnames:
 297                 add_qname(key)
 298             if isinstance(value, QName) and value.text not in qnames:
 299                 add_qname(value.text)
 300         text = elem.text
 301         if isinstance(text, QName) and text.text not in qnames:
 302             add_qname(text.text)
 303     return qnames, namespaces
 304
 305 def to_html_string(element):
 306     return _write_html(ElementTree(element).getroot(), format="html")
 307
 308 def to_xhtml_string(element):
 309     return _write_html(ElementTree(element).getroot(), format="xhtml")