1 # CSS and HTML files pre-processor. -*- coding: utf-8 -*-
2 # Copyright 2016 by Michał Nazarewicz <mina86@mina86.com>
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or (at
7 # your option) any later version.
9 # This program is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 # General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 # Unless required by applicable law or agreed to in writing, software
18 # distributed under the Apache License is distributed on an "AS IS"
19 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
20 # implied. See the Apache License for the specific language governing
21 # permissions and limitations under the License.
# Make ``<directory of this file>/../../htmlmin`` importable.  The entry goes
# in at index 1, i.e. right after the first search-path entry rather than in
# front of it.  (Presumably this directory holds a bundled copy of the
# ``htmlmin`` package -- confirm against the repository layout.)
sys.path.insert(
    1,
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', '..', 'htmlmin')
    ),
)
34 _MIMETYPES
: typing
.Dict
[str, bytes
] = {
35 '.jpg': b
'image/jpeg',
37 '.svg': b
'image/svg+xml',
def _encode_base64(mime: bytes, data: bytes) -> bytes:
    """Return a base64 ``data:`` URI for *data* with media type *mime*.

    Args:
        mime: Media type, already encoded as bytes (e.g. ``b'image/jpeg'``).
        data: Raw payload to embed.

    Returns:
        ``b'data:<mime>;base64,<payload>'`` where the payload is encoded with
        the standard base64 alphabet.
    """
    payload = base64.b64encode(data)
    return b''.join((b'data:', mime, b';base64,', payload))
44 def _encode_string(mime
: bytes
, data
: bytes
, css
: bool) -> bytes
:
45 chars
= r
'\0-\x1f\x80-\xff%#'
47 # In HTML mode we don’t have to URL escape greater than sign, apostrophe
48 # and quote in the string since those can be handled via HTML entities.
49 # However, URL encoding is just three-character long while the entities
51 if data
.count(b
"'") > data
.count(b
'"'):
56 str_data
= re
.sub('[{}]'.format(chars
),
57 lambda m
: '%%%02x' % ord(m
.group(0)),
58 data
.decode('utf-8').strip())
60 def fmt(data
: str, quote
: str = '') -> bytes
:
61 quote
= quote
.encode('ascii')
62 return b
'%sdata:%s,%s%s' % (quote
, mime
, data
.encode('ascii'), quote
)
67 str_data
.replace('\n', '\\n')
68 x
, y
= [fmt(str_data
.replace(q
, '\\' + q
), q
) for q
in '\'"']
69 return x
if len(x
) < len(y
) else y
71 def _insert_data(src_dir
: str, path
: str, css
: bool = False) -> bytes
:
72 _
, ext
= os
.path
.splitext(path
)
73 mime
= _MIMETYPES
[ext
]
74 data
= open(os
.path
.join(src_dir
, path
), 'rb').read()
76 encoded
= _encode_base64(mime
, data
)
81 y
= _encode_string(mime
, data
, css
)
82 return x
if len(x
) < len(y
) else y
def _map_static(mappings: typing.Dict[str, str], path: str) -> str:
    """Translate *path* through *mappings*, leaving unknown paths untouched.

    Args:
        mappings: Lookup table from original path to replacement path.
        path: Path to translate.

    Returns:
        ``mappings[path]`` when *path* is a key of *mappings*, otherwise
        *path* itself.
    """
    if path in mappings:
        return mappings[path]
    return path
89 def process_css(data
: bytes
, src_dir
: str,
90 mappings
: typing
.Dict
[str, str]) -> bytes
:
91 data
= re
.sub(rb
'DATA<([^<>]+)>',
92 lambda m
: _insert_data(src_dir
, m
.group(1).decode('utf-8'),
95 data
= re
.sub(rb
'/d/[-_a-zA-Z0-9.]*',
96 lambda m
: _map_static(mappings
, m
.group(0).decode('utf-8')).encode('utf-8'),
# Type of the ``attrs`` argument taken by the parser's tag handlers: a
# sequence of ``(name, value)`` pairs in which the value may be ``None``.
_Attributes = typing.Sequence[
    typing.Tuple[str, typing.Optional[str]]
]
102 class HTMLMinParser(htmlmin
.parser
.HTMLMinParser
):
def _minify_css(data):
    """Squeeze redundant white-space out of a piece of CSS.

    Leading and trailing white-space is stripped and every internal run of
    white-space becomes a single space; after that, any white-space directly
    before or after one of ``: ; , { }`` is removed.

    This is a purely textual transformation and is very likely to break
    non-trivial rules.

    Args:
        data: CSS source text.

    Returns:
        The compacted CSS text.
    """
    collapsed = re.sub(r'\s+', ' ', data.strip())
    # Keep only the punctuation mark itself, dropping the spaces around it.
    return re.sub(r'\s*([:;,{}])\s*', lambda found: found.group(1), collapsed)
def __init__(self, *args: typing.Any, **kw: typing.Any):
    """Set up the parser, consuming the options specific to this subclass.

    Three keyword arguments are removed from *kw* before the base class
    sees it; each defaults to ``None`` when absent:

    * ``static_mappings`` -- stored as ``self._static_mappings``.
    * ``src_dir`` -- stored as ``self._src_dir``.
    * ``self_url`` -- becomes the second item of the initial
      ``self._self`` entry.

    All positional arguments and the remaining keyword arguments are
    forwarded unchanged to the base-class constructor.
    """
    static_mappings = kw.pop('static_mappings', None)
    src_dir = kw.pop('src_dir', None)
    self_url = kw.pop('self_url', None)

    self._static_mappings = static_mappings
    self._src_dir = src_dir
    # List of ``[counter, url]`` pairs, used as a stack: the tag handlers
    # adjust the counter of the last entry.  It starts with a single entry
    # whose counter is 1.
    self._self = [[1, self_url]]
    super().__init__(*args, **kw)
def handle_starttag(self, tag: str, attrs: _Attributes) -> None:
    """Process an opening tag.

    The attributes are first rewritten through ``_transform_attrs``; then
    the counter of the last ``self._self`` entry is incremented, and finally
    the tag is passed on to the base-class handler.

    Args:
        tag: Name of the tag being opened.
        attrs: ``(name, value)`` pairs of the tag's attributes.
    """
    # Must run before the counter is touched: _transform_attrs may append a
    # new entry to self._self, and that new entry is the one to increment.
    self._transform_attrs(tag, attrs)
    top = self._self[-1]
    top[0] += 1
    super().handle_starttag(tag, attrs)
122 def handle_startendtag(self
, tag
: str, attrs
: _Attributes
) -> None:
123 self
._transform
_attrs
(tag
, attrs
)
124 if not self
._self
[-1][0]:
126 super().handle_startendtag(tag
, attrs
)
128 def handle_endtag(self
, tag
):
129 self
._self
[-1][0] -= 1
130 if not self
._self
[-1][0]:
132 super().handle_endtag(tag
)
134 def _transform_attrs(self
, tag
: str, attrs
: _Attributes
) -> None:
136 while i
< len(attrs
):
137 attr
, value
= attrs
[i
]
138 if (tag
in ('path', 'text', 'use', 'rect', 'circle') and
139 attr
in ('x', 'cx', 'y', 'cy') and
144 self
._self
.append([0, value
])
148 typing
.cast(typing
.List
, attrs
)[i
] = (
149 attr
, self
._transform
_attr
(tag
, attr
, value
))
152 def _transform_attr(self
, tag
: str, attr
: str, value
: str) -> str:
153 if self
._static
_mappings
:
154 ret
= self
._static
_mappings
.get(value
)
158 if self
._src
_dir
and tag
== 'img' and attr
== 'src':
159 return _insert_data(self
._src
_dir
, value
).decode('utf-8')
161 value
= re
.sub(r
'\s+', ' ', value
.strip())
163 value
= self
._minify
_css
(value
)
164 elif attr
== 'd' and tag
== 'path':
165 # In SVG’s D attribute of PATH element the only required white-space
166 # is between numbers (except space is not necessary before minus
168 value
= re
.sub(r
' ?([-a-zA-Z,]) ?', r
'\1', value
)
169 elif '%s %s' % (tag
, attr
) in ('link media', 'area coords',
171 # Comma separated lists, remove unnecessary spaces around commas.
172 value
= re
.sub(r
' ?, ?', ',', value
)
173 elif attr
in ('href', 'src') and tag
!= 'base':
174 if value
.startswith('https://'):
176 if value
.startswith('//mina86.com/'):
178 if value
.startswith('/self'):
179 s
= self
._self
[-1][1]
181 value
= s
+ value
[5:]
184 def handle_data(self
, data
):
185 if self
._tag
_stack
and self
._tag
_stack
[0][0] == 'style':
186 self
._data
_buffer
.append(self
._minify
_css
(data
))
188 super().handle_data(data
)
def _html_tag_re(tag, **kw):
    """Build a regular-expression fragment matching an HTML tag.

    Args:
        tag: Tag name, or a regex fragment matching tag names; it is
            inserted into the pattern verbatim (not escaped).
        **kw: Optional flags ``open`` and ``close``.  At most one of them
            may be truthy.

    Returns:
        * ``close`` truthy: ``</tag>`` -- closing tag only, no attributes.
        * ``open`` truthy: ``<tag\\b[^>]*>`` -- opening tag only.
        * neither: ``</?tag\\b[^>]*>`` -- opening or closing tag.
    """
    opening = kw.get('open')
    closing = kw.get('close')
    assert not (opening and closing)

    if closing:
        return '</{}>'.format(tag)
    slash = '' if opening else '/?'
    return r'<{}{}\b[^>]*>'.format(slash, tag)
200 _MINIFY_HTML_RE
= re
.compile(r
'''
201 {space} ( {block_tag} ) {space}?
202 | ( {block_tag} ) {space}
203 | {space} ( {text_open} )
204 | ( {text_close} ) {space}
205 | {space} ( {pre_open} ) {blank}*
206 | ( {pre_open} ) {blank}+
207 | \s+ ( {pre_close} ) {space}?
208 | ( {pre_close} ) {space}
211 blank
=r
'(?:[ \t]*\n)',
213 block_tag
=_html_tag_re('(?:%s)' % '|'.join((
214 'address', 'article', 'aside', 'base', 'blockquote', 'body', 'br',
215 'canvas', 'caption', 'col(?:group)?', 'd[dlt]', 'div',
216 'fig(?:caption|ure)', 'footer', 'form', 'h[1-6r]', 'head(?:er)?',
217 'hgroup', 'iframe', 'li(?:nk)?', 'main', 'meta', 'nav', 'noscript',
218 '[ou]l', 'opt(?:group|ion)', 'p', 'script', 'section', 'style',
219 't(?:able|head|body|foot|[dhr]|itle)',
222 'svg', 'rect', 'circle', 'g', 'path',
224 text_open
=_html_tag_re('text', open=True),
225 text_close
=_html_tag_re('text', close
=True),
226 pre_open
='{pre}(?:{code})?'.format(pre
=_html_tag_re('pre', open=True),
227 code
=_html_tag_re('code', open=True)),
228 pre_close
='(?:{code})?{pre}'.format(pre
=_html_tag_re('pre', close
=True),
229 code
=_html_tag_re('code', close
=True)),
232 def minify_html(data
: str, **kw
: typing
.Any
) -> str:
233 def make_parser(*args
: typing
.Any
, **kwargs
: typing
.Any
) -> HTMLMinParser
:
235 return HTMLMinParser(*args
, **kwargs
)
237 data
= htmlmin
.minify(data
,
238 remove_comments
=True,
239 remove_empty_space
=False,
240 remove_all_empty_space
=False,
241 reduce_empty_attributes
=True,
242 reduce_boolean_attributes
=True,
243 remove_optional_attribute_quotes
=True,
244 cls
=make_parser
).strip()
247 for i
in range(1, m
.lastindex
+ 1):
248 if (grp
:= m
.group(i
)) is not None:
251 data
= _MINIFY_HTML_RE
.sub(pick_group
, data
)