1 """HTML utilities suitable for global use."""
8 from django
.utils
.safestring
import SafeData
, mark_safe
9 from django
.utils
.encoding
import smart_str
, force_unicode
10 from django
.utils
.functional
import allow_lazy
11 from django
.utils
.text
import normalize_newlines
# Configuration for urlize() function.
TRAILING_PUNCTUATION = ['.', ',', ':', ';']
# Each pair is (opening, closing). The entity-encoded angle brackets sit next
# to the literal ones, presumably so that urlize() also recognises wrapping
# punctuation in text that has already been HTML-escaped.
WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('&lt;', '&gt;')]

# List of possible strings used for bullets in bulleted lists.
DOTS = [u'&middot;', u'*', u'\u2022', u'&#149;', u'&bull;', u'&#8226;']

# An ampersand that does not start a named (&word;) or decimal (&#123;) entity.
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
# A percent sign that is not followed by two hexadecimal digits.
unquoted_percents_re = re.compile(r'%(?![0-9A-Fa-f]{2})')
# Runs of whitespace; the capturing group makes split() keep the separators.
word_split_re = re.compile(r'(\s+)')
# Text starting with http:// or https:// followed by a word character.
simple_url_re = re.compile(r'^https?://\w')
# Text starting with "www.", or a non-"http" word without "@" that ends in one
# of the listed top-level domains.
simple_url_2_re = re.compile(r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)$')
# Whitespace-free text of the form something@something.something.
simple_email_re = re.compile(r'^\S+@\S+\.\S+$')
# A target=... attribute inside an <a ...> tag; group 1 is the tag text that
# precedes the attribute.
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
# Markup that carries no content: <br clear="all">, empty <i>/<b>/<em>/<strong>
# pairs and <smallcaps>/<uppercase> tags (case-insensitive).
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
# One or more consecutive <p> paragraphs that each start with one of DOTS and
# contain at least one ASCII letter. A generator expression is used so that no
# loop variable is left behind in the module namespace.
hard_coded_bullets_re = re.compile(
    r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)'
    % '|'.join(re.escape(dot) for dot in DOTS),
    re.DOTALL)
# Trailing paragraphs that contain only &nbsp;, whitespace or <br /> tags.
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
def escape(html):
    """
    Returns the given HTML with ampersands, quotes and angle brackets encoded.

    The input is coerced with force_unicode() and the result is passed through
    mark_safe().
    """
    text = force_unicode(html)
    # The ampersand must be replaced first; otherwise the ampersands introduced
    # by the later replacements would themselves be encoded again.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;').replace('>', '&gt;')
    text = text.replace('"', '&quot;').replace("'", '&#39;')
    return mark_safe(text)
escape = allow_lazy(escape, unicode)
49 (u
'\u2028', r
'\u2028'),
50 (u
'\u2029', r
'\u2029')
# Escape every ASCII character with a value less than 32.
# A generator expression is used so the loop variable does not leak into the
# module namespace (a list comprehension would leave it behind on Python 2).
_js_escapes = _base_js_escapes + tuple(
    ('%c' % code, '\\u%04X' % code) for code in range(32))

def escapejs(value):
    """
    Hex encodes characters for use in JavaScript strings.

    Every (bad, good) pair in _js_escapes is applied in order to the
    force_unicode() form of value; the result is passed through mark_safe().
    """
    # Coerce once up front instead of on every iteration of the loop.
    value = force_unicode(value)
    for bad, good in _js_escapes:
        value = value.replace(bad, good)
    return mark_safe(value)
escapejs = allow_lazy(escapejs, unicode)
def conditional_escape(html):
    """
    Similar to escape(), except that it doesn't operate on pre-escaped strings.

    SafeData instances are returned unchanged; anything else is passed
    through escape().
    """
    if isinstance(html, SafeData):
        return html
    return escape(html)
def linebreaks(value, autoescape=False):
    """
    Converts newlines into <p> and <br />s.

    After normalize_newlines(), the text is split into paragraphs on runs of
    two or more newlines. Each paragraph is wrapped in <p>...</p> and its
    remaining single newlines become <br />. If autoescape is True, each
    paragraph is passed through escape() before the <br /> tags are inserted.
    """
    value = normalize_newlines(value)
    paras = re.split('\n{2,}', value)
    if autoescape:
        paras = [u'<p>%s</p>' % escape(p).replace('\n', '<br />') for p in paras]
    else:
        paras = [u'<p>%s</p>' % p.replace('\n', '<br />') for p in paras]
    return u'\n\n'.join(paras)
linebreaks = allow_lazy(linebreaks, unicode)
def strip_tags(value):
    """Returns the given HTML with all tags stripped."""
    # A "tag" here is any "<", followed by the shortest run of non-">"
    # characters, followed by ">".
    return re.sub(r'<[^>]*?>', '', force_unicode(value))
# Declare unicode as the result class, matching every other allow_lazy()
# wrapper in this module.
strip_tags = allow_lazy(strip_tags, unicode)
def strip_spaces_between_tags(value):
    """
    Returns the given HTML with whitespace between adjacent tags removed,
    i.e. every ">", whitespace, "<" sequence collapses to "><".
    """
    markup = force_unicode(value)
    return re.sub(r'>\s+<', '><', markup)
strip_spaces_between_tags = allow_lazy(strip_spaces_between_tags, unicode)
def strip_entities(value):
    """
    Returns the given HTML with every named (&something;) or decimal
    (&#123;) entity removed.
    """
    markup = force_unicode(value)
    return re.sub(r'&(?:\w+|#\d+);', '', markup)
strip_entities = allow_lazy(strip_entities, unicode)
def fix_ampersands(value):
    """
    Returns the given HTML with all unencoded ampersands encoded correctly.

    Every match of unencoded_ampersands_re in the force_unicode() form of
    value is replaced with "&amp;".
    """
    return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
fix_ampersands = allow_lazy(fix_ampersands, unicode)
def smart_urlquote(url):
    """
    Quotes a URL if it isn't already quoted.

    The network location is first encoded with the 'idna' codec; if that
    fails the URL is left as given. The result is returned through
    force_unicode().
    """
    # Handle IDN before quoting.
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    try:
        netloc = netloc.encode('idna')  # IDN -> ACE
    except UnicodeError:  # invalid domain part
        pass
    else:
        url = urlparse.urlunsplit((scheme, netloc, path, query, fragment))

    # An URL is considered unquoted if it contains no % characters or
    # contains a % not followed by two hexadecimal digits. See #9655.
    if '%' not in url or unquoted_percents_re.search(url):
        # See http://bugs.python.org/issue2637
        url = urllib.quote(smart_str(url), safe='!*\'();:@&=+$,/?#[]~')

    return force_unicode(url)
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Converts any URLs in text into clickable links.

    Works on http://, https://, www. links, and also on links ending in one of
    the original seven gTLDs (.com, .edu, .gov, .int, .mil, .net, and .org).
    Links can have trailing punctuation (periods, commas, close-parens) and
    leading punctuation (opening parens) and it'll still do the right thing.

    If trim_url_limit is not None, the URLs in link text longer than this limit
    will be truncated to trim_url_limit-3 characters and appended with an
    ellipsis.

    If nofollow is True, the URLs in link text will get a rel="nofollow"
    attribute.

    If autoescape is True, the link text and URLs will get autoescaped.
    """
    def trim_url(x, limit=trim_url_limit):
        # Shorten link text that exceeds the limit, ending it with "...".
        if limit is not None and len(x) > limit:
            return '%s...' % x[:max(0, limit - 3)]
        return x

    safe_input = isinstance(text, SafeData)
    # word_split_re captures the whitespace, so joining the list at the end
    # reproduces the original spacing.
    words = word_split_re.split(force_unicode(text))
    for i, word in enumerate(words):
        if '.' in word or '@' in word or ':' in word:
            # Deal with punctuation.
            lead, middle, trail = '', word, ''
            for punctuation in TRAILING_PUNCTUATION:
                if middle.endswith(punctuation):
                    middle = middle[:-len(punctuation)]
                    trail = punctuation + trail
            for opening, closing in WRAPPING_PUNCTUATION:
                if middle.startswith(opening):
                    middle = middle[len(opening):]
                    lead = lead + opening
                # Keep parentheses at the end only if they're balanced.
                if (middle.endswith(closing)
                        and middle.count(closing) == middle.count(opening) + 1):
                    middle = middle[:-len(closing)]
                    trail = closing + trail

            # Make URL we want to point to.
            url = None
            nofollow_attr = ' rel="nofollow"' if nofollow else ''
            if simple_url_re.match(middle):
                url = smart_urlquote(middle)
            elif simple_url_2_re.match(middle):
                url = smart_urlquote('http://%s' % middle)
            elif ':' not in middle and simple_email_re.match(middle):
                local, domain = middle.rsplit('@', 1)
                try:
                    domain = domain.encode('idna')
                except UnicodeError:
                    # The domain cannot be IDNA-encoded: do not link it. The
                    # word falls through to the plain-text handling below, so
                    # it is still escaped when autoescaping applies.
                    pass
                else:
                    url = 'mailto:%s@%s' % (local, domain)
                    # rel="nofollow" is not added to mailto: links.
                    nofollow_attr = ''

            # Make link.
            if url:
                trimmed = trim_url(middle)
                if autoescape and not safe_input:
                    lead, trail = escape(lead), escape(trail)
                    url, trimmed = escape(url), escape(trimmed)
                middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr, trimmed)
                words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
            elif safe_input:
                words[i] = mark_safe(word)
            elif autoescape:
                words[i] = escape(word)
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return u''.join(words)
urlize = allow_lazy(urlize, unicode)
def clean_html(text):
    """
    Clean the given HTML.  Specifically, do the following:
        * Convert <b> and <i> to <strong> and <em>.
        * Encode all ampersands correctly.
        * Remove all "target" attributes from <a> tags.
        * Remove extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
        * Convert hard-coded bullets into HTML unordered lists.
        * Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
          bottom of the text.
    """
    text = normalize_newlines(force_unicode(text))
    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
    text = fix_ampersands(text)
    # Remove all target="" attributes from <a> tags.
    text = link_target_attribute_re.sub('\\1', text)
    # Trim stupid HTML such as <br clear="all">.
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
        # Turn each matched "<p>{bullet}...</p>" into "<li>...</li>" and wrap
        # the whole run in a single <ul>.
        s = match.group().replace(u'</p>', u'</li>')
        for d in DOTS:
            s = s.replace(u'<p>%s' % d, u'<li>')
        return u'<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
    # of the text.
    text = trailing_empty_content_re.sub(u'', text)
    return text
clean_html = allow_lazy(clean_html, unicode)