"""HTML utilities suitable for global use."""
import re
import string

from django.utils.safestring import SafeData, mark_safe
from django.utils.encoding import force_unicode
from django.utils.functional import allow_lazy
from django.utils.http import urlquote
# Configuration for urlize() function.
# Angle brackets are listed both as raw characters and in HTML-entity form so
# that punctuation is recognised in already-escaped text as well.
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

# List of possible strings used for bullets in bulleted lists.
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']

# The alternations below are built with map(re.escape, ...) rather than list
# comprehensions, so no temporary loop variable is left behind in the module
# namespace and no trailing "del" is needed.

# An ampersand that does not start a named (&name;) or decimal (&#123;) entity.
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
# The capturing group keeps the whitespace runs in the list returned by split().
word_split_re = re.compile(r'(\s+)')
# Splits a word into leading punctuation, a middle part and trailing punctuation.
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
    '|'.join(map(re.escape, LEADING_PUNCTUATION)),
    '|'.join(map(re.escape, TRAILING_PUNCTUATION)),
))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
# Group 1 captures the part of the <a ...> tag that precedes the target attribute.
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(
    r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)',
    re.IGNORECASE)
# One or more consecutive <p> elements that start with one of the DOTS markers.
hard_coded_bullets_re = re.compile(
    r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join(map(re.escape, DOTS)),
    re.DOTALL)
# A run of <p> elements containing only &nbsp;, whitespace or <br />, anchored
# at the very end of the string.
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
def escape(html):
    """
    Returns the given HTML with ampersands, quotes and carets encoded.

    The argument is coerced with force_unicode() and the encoded result is
    passed through mark_safe().
    """
    text = force_unicode(html)
    # '&' has to be encoded first; otherwise the ampersands introduced by the
    # replacements below would be encoded a second time.
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;').replace('>', '&gt;')
    text = text.replace('"', '&quot;').replace("'", '&#39;')
    return mark_safe(text)
escape = allow_lazy(escape, unicode)
def conditional_escape(html):
    """
    Similar to escape(), except that it doesn't operate on pre-escaped strings.

    SafeData instances are returned unchanged; anything else goes through
    escape().
    """
    if isinstance(html, SafeData):
        return html
    return escape(html)
def linebreaks(value, autoescape=False):
    """
    Converts newlines into <p> and <br />s.

    Runs of two or more newlines separate paragraphs; a single newline inside
    a paragraph becomes <br />. If autoescape is True, each paragraph is
    passed through escape() before the <br /> tags are inserted.
    """
    value = re.sub(r'\r\n|\r|\n', '\n', force_unicode(value)) # normalize newlines
    paras = re.split('\n{2,}', value)
    if autoescape:
        # Escape first, then insert the <br /> markup, so the tags added here
        # are not themselves escaped.
        paras = [u'<p>%s</p>' % escape(p.strip()).replace('\n', '<br />') for p in paras]
    else:
        paras = [u'<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
    return u'\n\n'.join(paras)
linebreaks = allow_lazy(linebreaks, unicode)
def strip_tags(value):
    """Returns the given HTML with all tags stripped."""
    # Anything from "<" up to the nearest ">" is treated as a tag and dropped.
    return re.sub(r'<[^>]*?>', '', force_unicode(value))
# unicode is declared as the result type, matching the other helpers in this
# module that are wrapped with allow_lazy().
strip_tags = allow_lazy(strip_tags, unicode)
def strip_spaces_between_tags(value):
    """Returns the given HTML with spaces between tags removed."""
    html = force_unicode(value)
    # Only whitespace sitting directly between a ">" and the next "<" is
    # removed; the two delimiters themselves are kept.
    return re.sub(r'>\s+<', '><', html)
strip_spaces_between_tags = allow_lazy(strip_spaces_between_tags, unicode)
def strip_entities(value):
    """Returns the given HTML with all entities (&something;) stripped."""
    html = force_unicode(value)
    # Matches named entities (&name;) and decimal character references (&#123;).
    return re.sub(r'&(?:\w+|#\d+);', '', html)
strip_entities = allow_lazy(strip_entities, unicode)
def fix_ampersands(value):
    """Returns the given HTML with all unencoded ampersands encoded correctly."""
    # unencoded_ampersands_re only matches an "&" that does not already begin
    # an entity, so existing entities are left untouched.
    return unencoded_ampersands_re.sub('&amp;', force_unicode(value))
fix_ampersands = allow_lazy(fix_ampersands, unicode)
def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
    """
    Converts any URLs in text into clickable links.

    Works on http://, https://, www. links and links ending in .org, .net or
    .com. Links can have trailing punctuation (periods, commas, close-parens)
    and leading punctuation (opening parens) and it'll still do the right
    thing.

    If trim_url_limit is not None, the URLs in link text longer than this limit
    will be truncated to trim_url_limit-3 characters and appended with an
    ellipsis.

    If nofollow is True, the URLs in link text will get a rel="nofollow"
    attribute.

    If autoescape is True, the link text and URLs will get autoescaped.
    """
    def trim_url(x, limit=trim_url_limit):
        # Shorten the visible link text only; the href keeps the full URL.
        if limit is not None and len(x) > limit:
            return '%s...' % x[:max(0, limit - 3)]
        return x

    safe_input = isinstance(text, SafeData)
    # word_split_re has a capturing group, so the whitespace separators stay
    # in the list and the final join reproduces the original spacing.
    words = word_split_re.split(force_unicode(text))
    for i, word in enumerate(words):
        url = None
        # Cheap pre-filter: only words containing one of these characters can
        # be a URL or an e-mail address.
        if '.' in word or '@' in word or ':' in word:
            match = punctuation_re.match(word)
            if match:
                lead, middle, trail = match.groups()
                # Computed per link so that dropping it for one link does not
                # affect later links in the same text.
                nofollow_attr = nofollow and ' rel="nofollow"' or ''
                # Make URL we want to point to.
                if middle.startswith('http://') or middle.startswith('https://'):
                    url = urlquote(middle, safe='/&=:;#?+*')
                elif middle.startswith('www.') or ('@' not in middle and \
                        middle and middle[0] in string.ascii_letters + string.digits and \
                        (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
                    url = urlquote('http://%s' % middle, safe='/&=:;#?+*')
                elif '@' in middle and not ':' in middle and simple_email_re.match(middle):
                    url = 'mailto:%s' % middle
                    # rel="nofollow" is left off mailto: links.
                    nofollow_attr = ''
        # Make link.
        if url:
            trimmed = trim_url(middle)
            if autoescape and not safe_input:
                lead, trail = escape(lead), escape(trail)
                url, trimmed = escape(url), escape(trimmed)
            middle = '<a href="%s"%s>%s</a>' % (url, nofollow_attr, trimmed)
            words[i] = mark_safe('%s%s%s' % (lead, middle, trail))
        elif safe_input:
            words[i] = mark_safe(word)
        elif autoescape:
            words[i] = escape(word)
    return u''.join(words)
urlize = allow_lazy(urlize, unicode)
def clean_html(text):
    """
    Clean the given HTML.  Specifically, do the following:
        * Convert <b> and <i> to <strong> and <em>.
        * Encode all ampersands correctly.
        * Remove all "target" attributes from <a> tags.
        * Remove extraneous HTML, such as presentational tags that open and
          immediately close and <br clear="all">.
        * Convert hard-coded bullets into HTML unordered lists.
        * Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
          bottom of the text.
    """
    from django.utils.text import normalize_newlines
    text = normalize_newlines(force_unicode(text))
    # \\1 carries over the optional "/" so closing tags are converted as well.
    text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
    text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
    text = fix_ampersands(text)
    # Remove all target="" attributes from <a> tags.
    text = link_target_attribute_re.sub('\\1', text)
    # Trim stupid HTML such as <br clear="all">.
    text = html_gunk_re.sub('', text)
    # Convert hard-coded bullets into HTML unordered lists.
    def replace_p_tags(match):
        # Turn each "<p>BULLET ...</p>" of the matched run into "<li>...</li>"
        # and wrap the whole run in a single <ul>.
        s = match.group().replace('</p>', '</li>')
        for d in DOTS:
            s = s.replace('<p>%s' % d, '<li>')
        return u'<ul>\n%s\n</ul>' % s
    text = hard_coded_bullets_re.sub(replace_p_tags, text)
    # Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom
    # of the text.
    text = trailing_empty_content_re.sub('', text)
    return text
clean_html = allow_lazy(clean_html, unicode)