tools/compilers.py

   1 # CSS and HTML files pre-processor.                -*- coding: utf-8 -*-
   2 # Copyright 2016 by Michał Nazarewicz <mina86@mina86.com>
   3 #
   4 # This program is  free software: you can redistribute  it and/or modify
   5 # it under the  terms of the GNU General Public  License as published by
   6 # the Free Software Foundation, either version  3 of the License, or (at
   7 # your option) any later version.
   8 #
   9 # This program  is distributed in the  hope that it will  be useful, but
  10 # WITHOUT   ANY  WARRANTY;   without  even   the  implied   warranty  of
  11 # MERCHANTABILITY  or FITNESS  FOR A  PARTICULAR PURPOSE.   See the  GNU
  12 # General Public License for more details.
  13 #
  14 # You should  have received  a copy  of the  GNU General  Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16 #
  17 # Unless required  by applicable law  or agreed to in  writing, software
  18 # distributed  under the  Apache License  is distributed  on an  "AS IS"
  19 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  20 # implied.  See the  Apache License for the  specific language governing
  21 # permissions and limitations under the License.
  22
  23 import base64
  24 import os
  25 import re
  26 import sys
  27 import typing
  28
  29 sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
  30                                                 '..', '..', 'htmlmin')))
  31 import htmlmin.parser
  32
  33
  34 _MIMETYPES: typing.Dict[str, bytes] = {
  35     '.jpg': b'image/jpeg',
  36     '.png': b'image/png',
  37     '.svg': b'image/svg+xml',
  38 }
  39
  40 def _encode_base64(mime: bytes, data: bytes) -> bytes:
  41     data = base64.standard_b64encode(data)
  42     return b'data:%s;base64,%s' % (mime, data)
  43
  44 def _encode_string(mime: bytes, data: bytes, css: bool) -> bytes:
  45     chars = r'\0-\x1f\x80-\xff%#'
  46     if not css:
  47         # In HTML mode we don’t have to URL escape greater than sign, apostrophe
  48         # and quote in the string since those can be handled via HTML entities.
  49         # However, URL encoding is just three-character long while the entities
  50         # are at least four.
  51         if data.count(b"'") > data.count(b'"'):
  52             chars += r' >"'
  53         else:
  54             chars += r" >'"
  55
  56     str_data = re.sub('[{}]'.format(chars),
  57                       lambda m: '%%%02x' % ord(m.group(0)),
  58                       data.decode('utf-8').strip())
  59
  60     def fmt(data: str, quote: str = '') -> bytes:
  61         quote = quote.encode('ascii')
  62         return b'%sdata:%s,%s%s' % (quote, mime, data.encode('ascii'), quote)
  63
  64     if not css:
  65         return fmt(str_data)
  66
  67     str_data.replace('\n', '\\n')
  68     x, y = [fmt(str_data.replace(q, '\\' + q), q) for q in '\'"']
  69     return x if len(x) < len(y) else y
  70
  71 def _insert_data(src_dir: str, path: str, css: bool = False) -> bytes:
  72     _, ext = os.path.splitext(path)
  73     mime = _MIMETYPES[ext]
  74     data = open(os.path.join(src_dir, path), 'rb').read()
  75
  76     encoded = _encode_base64(mime, data)
  77     if ext != '.svg':
  78         return encoded
  79
  80     x = encoded
  81     y = _encode_string(mime, data, css)
  82     return x if len(x) < len(y) else y
  83
  84
  85 def _map_static(mappings: typing.Dict[str, str], path: str) -> str:
  86     return mappings.get(path, path)
  87
  88
  89 def process_css(data: bytes, src_dir: str,
  90                 mappings: typing.Dict[str, str]) -> bytes:
  91     data = re.sub(rb'DATA<([^<>]+)>',
  92                   lambda m: _insert_data(src_dir, m.group(1).decode('utf-8'),
  93                                          css=True),
  94                   data)
  95     data = re.sub(rb'/d/[-_a-zA-Z0-9.]*',
  96                   lambda m: _map_static(mappings, m.group(0).decode('utf-8')).encode('utf-8'),
  97                   data)
  98     return data
  99
 100 _Attributes = typing.Sequence[typing.Tuple[str, typing.Optional[str]]]
 101
 102 class HTMLMinParser(htmlmin.parser.HTMLMinParser):
 103
 104     @staticmethod
 105     def _minify_css(data):
 106         # CSS style, remove unnecessary spaces after punctuation marks.
 107         # This is very likely to break non-trivial rules.
 108         data = re.sub(r'\s+', ' ', data.strip())
 109         return re.sub(r'\s*([:;,{}])\s*', r'\1', data)
 110
 111     def __init__(self, *args: typing.Any, **kw: typing.Any):
 112         self._static_mappings = kw.pop('static_mappings', None)
 113         self._src_dir = kw.pop('src_dir', None)
 114         self._self = [[1, kw.pop('self_url', None)]]
 115         super().__init__(*args, **kw)
 116
 117     def handle_starttag(self, tag: str, attrs: _Attributes) -> None:
 118         self._transform_attrs(tag, attrs)
 119         self._self[-1][0] += 1
 120         super().handle_starttag(tag, attrs)
 121
 122     def handle_startendtag(self, tag: str, attrs: _Attributes) -> None:
 123         self._transform_attrs(tag, attrs)
 124         if not self._self[-1][0]:
 125             self._self.pop()
 126         super().handle_startendtag(tag, attrs)
 127
 128     def handle_endtag(self, tag):
 129         self._self[-1][0] -= 1
 130         if not self._self[-1][0]:
 131             self._self.pop()
 132         super().handle_endtag(tag)
 133
 134     def _transform_attrs(self, tag: str, attrs: _Attributes) -> None:
 135         i = 0
 136         while i < len(attrs):
 137             attr, value = attrs[i]
 138             if (tag in ('path', 'text', 'use', 'rect', 'circle') and
 139                 attr in ('x', 'cx', 'y', 'cy') and
 140                 value == '0'):
 141                 del attrs[i]
 142                 continue
 143             if attr == 'self':
 144                 self._self.append([0, value])
 145                 del attrs[i]
 146                 continue
 147             if value:
 148                 typing.cast(typing.List, attrs)[i] = (
 149                     attr, self._transform_attr(tag, attr, value))
 150             i += 1
 151
 152     def _transform_attr(self, tag: str, attr: str, value: str) -> str:
 153         if self._static_mappings:
 154             ret = self._static_mappings.get(value)
 155             if ret:
 156                 return ret
 157
 158         if self._src_dir and tag == 'img' and attr == 'src':
 159             return _insert_data(self._src_dir, value).decode('utf-8')
 160
 161         value = re.sub(r'\s+', ' ', value.strip())
 162         if attr == 'style':
 163             value = self._minify_css(value)
 164         elif attr == 'd' and tag == 'path':
 165             # In SVG’s D attribute of PATH element the only required white-space
 166             # is between numbers (except space is not necessary before minus
 167             # sign).
 168             value = re.sub(r' ?([-a-zA-Z,]) ?', r'\1', value)
 169         elif '%s %s' % (tag, attr) in ('link media', 'area coords',
 170                                      'meta content'):
 171             # Comma separated lists, remove unnecessary spaces around commas.
 172             value = re.sub(r' ?, ?', ',', value)
 173         elif attr in ('href', 'src') and tag != 'base':
 174             if value.startswith('https://'):
 175                 value = value[6:]
 176             if value.startswith('//mina86.com/'):
 177                 value = value[12:]
 178             if value.startswith('/self'):
 179                 s = self._self[-1][1]
 180                 assert s is not None
 181                 value = s + value[5:]
 182         return value
 183
 184     def handle_data(self, data):
 185         if self._tag_stack and self._tag_stack[0][0] == 'style':
 186             self._data_buffer.append(self._minify_css(data))
 187             return
 188         super().handle_data(data)
 189
 190
 191 def _html_tag_re(tag, **kw):
 192     is_open = kw.get('open')
 193     is_close = kw.get('close')
 194     assert not (is_open and is_close)
 195
 196     fmt = r'</{tag}>' if is_close else r'<{slash}{tag}\b[^>]*>'
 197     return fmt.format(slash='' if is_open else '/?', tag=tag)
 198
 199
 200 _MINIFY_HTML_RE = re.compile(r'''
 201       {space} ( {block_tag}  ) {space}?
 202     |         ( {block_tag}  ) {space}
 203     | {space} ( {text_open}  )
 204     |         ( {text_close} ) {space}
 205     | {space} ( {pre_open}   ) {blank}*
 206     |         ( {pre_open}   ) {blank}+
 207     | \s+     ( {pre_close}  ) {space}?
 208     |         ( {pre_close}  ) {space}
 209 '''.format(
 210     space=r'\ ',
 211     blank=r'(?:[ \t]*\n)',
 212
 213     block_tag=_html_tag_re('(?:%s)' % '|'.join((
 214         'address', 'article', 'aside', 'base', 'blockquote', 'body', 'br',
 215         'canvas', 'caption', 'col(?:group)?', 'd[dlt]', 'div',
 216         'fig(?:caption|ure)', 'footer', 'form', 'h[1-6r]', 'head(?:er)?',
 217         'hgroup', 'iframe', 'li(?:nk)?', 'main', 'meta', 'nav', 'noscript',
 218         '[ou]l', 'opt(?:group|ion)', 'p', 'script', 'section', 'style',
 219         't(?:able|head|body|foot|[dhr]|itle)',
 220
 221         # SVG elements:
 222         'svg', 'rect', 'circle', 'g', 'path',
 223     ))),
 224     text_open=_html_tag_re('text', open=True),
 225     text_close=_html_tag_re('text', close=True),
 226     pre_open='{pre}(?:{code})?'.format(pre=_html_tag_re('pre', open=True),
 227                                        code=_html_tag_re('code', open=True)),
 228     pre_close='(?:{code})?{pre}'.format(pre=_html_tag_re('pre', close=True),
 229                                         code=_html_tag_re('code', close=True)),
 230 ), re.VERBOSE)
 231
 232 def minify_html(data: str, **kw: typing.Any) -> str:
 233     def make_parser(*args: typing.Any, **kwargs: typing.Any) -> HTMLMinParser:
 234         kwargs.update(kw)
 235         return HTMLMinParser(*args, **kwargs)
 236
 237     data = htmlmin.minify(data,
 238                           remove_comments=True,
 239                           remove_empty_space=False,
 240                           remove_all_empty_space=False,
 241                           reduce_empty_attributes=True,
 242                           reduce_boolean_attributes=True,
 243                           remove_optional_attribute_quotes=True,
 244                           cls=make_parser).strip()
 245
 246     def pick_group(m):
 247         for i in range(1, m.lastindex + 1):
 248             if (grp := m.group(i)) is not None:
 249                 return grp
 250
 251     data = _MINIFY_HTML_RE.sub(pick_group, data)
 252
 253     return data