2 from html
.entities
import entitydefs
4 from django
.utils
.safestring
import mark_safe
5 from django
import template
6 from django
.utils
.safestring
import mark_safe
# Module-level django.template.Library instance.
# NOTE(review): nothing in the visible code registers a tag or filter on it;
# presumably that happens via a decorator outside this view -- confirm.
register = template.Library()
def remove_html_tags(html):
    """Convert an HTML fragment to plain text.

    Processing steps, in order:

    1. ``<br>``, ``<ul>``, ``</ul>`` and ``</li>`` become a newline,
       ``<li>`` becomes a ``" * "`` bullet on a new line, and a bare
       ``<p>`` / ``<P>`` becomes a blank line.
    2. Every remaining tag is removed.
    3. Character references are decoded in a single pass: decimal
       (``&#233;``), hexadecimal (``&#xE9;``) and named (``&eacute;``).
       Named references that are not in ``html.entities.entitydefs`` and
       numeric references outside the Unicode range are dropped.
    4. Runs of three or more CR/LF characters collapse to the first two.
    5. Leading and trailing whitespace is stripped.

    Args:
        html: The markup to convert (a string).

    Returns:
        The plain-text result, passed through ``mark_safe``.
    """
    # The ``re`` module caches compiled patterns, so compiling on each call
    # is cheap; hoist these to module level only if profiling says so.
    re_strip_tags = re.compile(r"<[^>]*>")
    re_newline_tags = re.compile(r"(<br[^>]*>|<[/]?ul[^>]*>|</li>)", re.I)
    re_listing_tags = re.compile(r"<li[^>]*>", re.I)
    # One strict pattern for all references.  The previous ``&(.{2,8});``
    # matched arbitrary text between "&" and ";" and deleted it, e.g. the
    # whole of "&amp; &lt;" was treated as a single unknown entity.
    re_entities = re.compile(
        r"&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]{1,7});"
    )

    def decode_entity(match):
        """Return the character for one reference, or "" if it is unknown."""
        ref = match.group(1)
        if not ref.startswith("#"):
            # entitydefs values are already ``str`` on Python 3; wrapping
            # them in ``str(value, "iso-8859-1")`` raises TypeError.
            return entitydefs.get(ref, "")
        try:
            if ref[1] in "xX":
                return chr(int(ref[2:], 16))
            return chr(int(ref[1:]))
        except (ValueError, OverflowError):
            # Code point beyond the Unicode range: drop it like an unknown name.
            return ""

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub("\n", result)
    result = re_listing_tags.sub("\n * ", result)
    result = re.sub(r"<[Pp]>", "\n\n", result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub("", result)

    # Decode numeric and named references in ONE pass so that text produced
    # by decoding is never decoded again ("&#38;lt;" yields "&lt;", not "<").
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r"([\r\n]{2})[\r\n]+", r"\1", result)

    # NOTE(security): decoding turns "&lt;script&gt;" into a literal
    # "<script>", and mark_safe() then exempts it from template
    # auto-escaping.  Only use this on trusted markup, or return the plain
    # string so the template escapes it -- confirm the intent with callers.
    return mark_safe(result.strip())