2 from django
.conf
import settings
3 from django
.utils
.encoding
import force_unicode
4 from django
.utils
.functional
import allow_lazy
5 from django
.utils
.translation
import ugettext_lazy
6 from htmlentitydefs
import name2codepoint
# Capitalizes the first letter of a string.
def capfirst(x):
    # Falsy input (empty string, None, etc.) is returned unchanged,
    # exactly as ``x and ...`` would do.
    if not x:
        return x
    return force_unicode(x)[0].upper() + force_unicode(x)[1:]
capfirst = allow_lazy(capfirst, unicode)
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines.

    Returns a unicode string; ``width`` is the column at which lines wrap.
    """
    # NOTE(review): the inner generator body was missing/corrupted in this
    # copy; it has been restored to match upstream django.utils.text.wrap
    # (the surviving fragments and their line numbers agree) -- verify
    # against your Django version.
    text = force_unicode(text)
    def _generator():
        it = iter(text.split(' '))
        word = it.next()
        yield word
        # Current column, measured from the last newline inside `word`.
        pos = len(word) - word.rfind('\n') - 1
        for word in it:
            if "\n" in word:
                lines = word.split('\n')
            else:
                lines = (word,)
            pos += len(lines[0]) + 1
            if pos > width:
                # Replace the separating space with a newline.
                yield '\n'
                pos = len(lines[-1])
            else:
                yield ' '
                if len(lines) > 1:
                    pos = len(lines[-1])
            yield word
    return u''.join(_generator())
wrap = allow_lazy(wrap, unicode)
def truncate_words(s, num):
    "Truncates a string after a certain number of words."
    # NOTE(review): three lines were missing in this copy; restored from
    # upstream django.utils.text.truncate_words -- verify against your
    # Django version.
    s = force_unicode(s)
    length = int(num)
    words = s.split()
    if len(words) > length:
        words = words[:length]
        # Don't double an ellipsis the final kept word already carries.
        if not words[-1].endswith('...'):
            words.append('...')
    return u' '.join(words)
truncate_words = allow_lazy(truncate_words, unicode)
def truncate_html_words(s, num):
    """
    Truncates html to a certain number of words (not counting tags and
    comments). Closes opened tags if they were correctly closed in the given
    html.

    NOTE(review): much of this function's control flow was missing in this
    copy; it was restored from upstream django.utils.text.truncate_html_words
    (the surviving fragments agree) -- verify against your Django version.
    """
    s = force_unicode(s)
    length = int(num)
    if length <= 0:
        return u''
    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
    # Set up regular expressions
    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
    # Count non-HTML words and keep note of open tags
    pos = 0
    ellipsis_pos = 0
    words = 0
    open_tags = []
    while words <= length:
        m = re_words.search(s, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                ellipsis_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m.group(0))
        if not tag or ellipsis_pos:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower() # Element names are always case-insensitive
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i+1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return s
    out = s[:ellipsis_pos] + ' ...'
    # Close any tags still open
    for tag in open_tags:
        out += '</%s>' % tag
    # Return string
    return out
truncate_html_words = allow_lazy(truncate_html_words, unicode)
def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; other
    spaces are converted to underscores; and all non-filename-safe characters
    are removed.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    u'johns_portrait_in_2004.jpg'
    """
    cleaned = force_unicode(s).strip()
    cleaned = cleaned.replace(' ', '_')
    return re.sub(r'[^-A-Za-z0-9_.]', '', cleaned)
get_valid_filename = allow_lazy(get_valid_filename, unicode)
def get_text_list(list_, last_word=ugettext_lazy(u'or')):
    """
    Joins the items of ``list_`` with commas, using ``last_word`` before the
    final item.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    u'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    u'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    u'a and b'
    >>> get_text_list(['a'])
    u'a'
    >>> get_text_list([])
    u''
    """
    if len(list_) == 0:
        return u''
    if len(list_) == 1:
        return force_unicode(list_[0])
    head = ', '.join([force_unicode(item) for item in list_[:-1]])
    return u'%s %s %s' % (head, force_unicode(last_word), force_unicode(list_[-1]))
get_text_list = allow_lazy(get_text_list, unicode)
def normalize_newlines(text):
    # Collapse Windows (\r\n) and bare-\r line endings into POSIX \n,
    # then coerce the result to unicode.
    converted = re.sub(r'\r\n|\r|\n', '\n', text)
    return force_unicode(converted)
normalize_newlines = allow_lazy(normalize_newlines, unicode)
def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    text = force_unicode(text).lower()
    # Uppercase the first letter of the string and any letter that follows
    # '. ', '? ' or '! '.
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    # NOTE(review): this return was missing in this copy (orig line 155);
    # without it the function returned None.
    return text
# NOTE(review): unlike the other wrappers in this module, no ``unicode``
# result type is passed to allow_lazy here -- confirm that is intentional.
recapitalize = allow_lazy(recapitalize)
def phone2numeric(phone):
    """Converts a phone number with letters into its numeric equivalent."""
    # Standard telephone keypad mapping; note 'q' and 'z' are absent, and the
    # pattern below deliberately skips them ([A-PR-Y], case-insensitive).
    keypad = {
        'a': '2', 'b': '2', 'c': '2',
        'd': '3', 'e': '3', 'f': '3',
        'g': '4', 'h': '4', 'i': '4',
        'j': '5', 'k': '5', 'l': '5',
        'm': '6', 'n': '6', 'o': '6',
        'p': '7', 'r': '7', 's': '7',
        't': '8', 'u': '8', 'v': '8',
        'w': '9', 'x': '9', 'y': '9',
    }
    letters = re.compile(r'[A-PR-Y]', re.I)
    return letters.sub(lambda match: keypad.get(match.group(0).lower()), phone)
# Wrap for lazy-string support. NOTE(review): unlike the other wrappers in
# this module, no ``unicode`` result type is passed -- confirm intentional.
phone2numeric = allow_lazy(phone2numeric)
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    """Gzip-compresses the byte string ``s`` and returns the compressed bytes."""
    import cStringIO, gzip
    zbuf = cStringIO.StringIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    # NOTE(review): this write/close pair was missing in this copy (orig
    # lines 175-176); without it the function returned an empty buffer.
    # GzipFile must be closed to flush the gzip trailer into zbuf.
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
# Matches (and captures) a single non-ASCII character, U+0080 through U+FFFF.
ustring_re = re.compile(u"([\u0080-\uffff])")
def javascript_quote(s, quote_double_quotes=False):
    """
    Escapes a byte or unicode string for embedding in JavaScript source.

    Backslashes, carriage returns, newlines, tabs and single quotes are
    backslash-escaped; non-ASCII characters become ``\\uXXXX`` escapes.
    When ``quote_double_quotes`` is true, double quotes become ``&quot;``.
    Raises TypeError for non-string input.
    """

    def fix(match):
        # Replace one non-ASCII character with its \uXXXX escape.
        return r"\u%04x" % ord(match.group(1))

    # NOTE(review): the type-coercion lines below were missing in this copy
    # (orig lines 185-186, 189); restored from upstream.
    if type(s) == str:
        s = s.decode('utf-8')
    elif type(s) != unicode:
        raise TypeError(s)
    s = s.replace('\\', '\\\\')
    s = s.replace('\r', '\\r')
    s = s.replace('\n', '\\n')
    s = s.replace('\t', '\\t')
    s = s.replace("'", "\\'")
    if quote_double_quotes:
        # NOTE(review): this copy read ``s.replace('"', '"')`` (a no-op) --
        # almost certainly a mangled HTML entity; upstream uses '&quot;'.
        s = s.replace('"', '&quot;')
    return str(ustring_re.sub(fix, s))
javascript_quote = allow_lazy(javascript_quote, unicode)
# Matches one token for smart_split: a double-quoted string (with backslash
# escapes), a single-quoted string (likewise), or a run of non-whitespace.
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
    r"""
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.

    >>> list(smart_split(r'This is "a person\'s" test.'))
    [u'This', u'is', u'"a person\\\'s"', u'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    [u'Another', u"'person's'", u'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    [u'A', u'""funky" style"', u'test.']
    """
    # NOTE(review): ``bit = bit.group(0)`` and the final ``else`` branch were
    # missing in this copy (orig lines 217, 222-223); without them unquoted
    # words were never yielded. Restored from upstream.
    text = force_unicode(text)
    for bit in smart_split_re.finditer(text):
        bit = bit.group(0)
        if bit[0] == '"' and bit[-1] == '"':
            # Un-escape the interior but keep the surrounding quotes.
            yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif bit[0] == "'" and bit[-1] == "'":
            yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield bit
smart_split = allow_lazy(smart_split, unicode)
def _replace_entity(match):
    """
    re.sub callback for _entity_re: returns the character a single HTML
    entity reference names, or the original matched text if the reference
    is malformed or unknown.
    """
    # NOTE(review): the branch structure below was missing in this copy
    # (orig lines 228-231, 233-236, 238-239); restored from upstream.
    text = match.group(1)
    if text[0] == u'#':
        # Numeric character reference: decimal (#65) or hex (#x41 / #X41).
        text = text[1:]
        try:
            if text[0] in u'xX':
                c = int(text[1:], 16)
            else:
                c = int(text)
            return unichr(c)
        except ValueError:
            return match.group(0)
    else:
        # Named entity, e.g. &amp;, resolved via htmlentitydefs.
        try:
            return unichr(name2codepoint[text])
        except (ValueError, KeyError):
            return match.group(0)
# Matches an HTML entity reference: named (&amp;) or numeric (&#65; / &#x41;).
_entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape_entities(text):
    # Decode every HTML entity reference (&amp;, &#65;, &#x41;, ...) in text.
    decoded = _entity_re.sub(_replace_entity, text)
    return decoded
unescape_entities = allow_lazy(unescape_entities, unicode)