Fixed #8516: Corrected typos in UK localflavor documentation
[django.git] / django / utils / text.py
blob3686a454a8c7eff5119ad9df2f8cea26d8ebacc5
1 import re
2 from django.conf import settings
3 from django.utils.encoding import force_unicode
4 from django.utils.functional import allow_lazy
5 from django.utils.translation import ugettext_lazy
6 from htmlentitydefs import name2codepoint
def capfirst(x):
    "Capitalizes the first letter of a string; falsy values pass through unchanged."
    if not x:
        return x
    value = force_unicode(x)
    return value[0].upper() + value[1:]
capfirst = allow_lazy(capfirst, unicode)
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines.
    """
    text = force_unicode(text)
    def _generator():
        chunks = iter(text.split(' '))
        chunk = chunks.next()
        yield chunk
        # Current column, counted from the last newline inside the first chunk
        # (or from its start if it has none).
        col = len(chunk) - chunk.rfind('\n') - 1
        for chunk in chunks:
            if '\n' in chunk:
                pieces = chunk.split('\n')
            else:
                pieces = (chunk,)
            # The space plus the part of the chunk up to its first newline.
            col += len(pieces[0]) + 1
            if col > width:
                # Emit a newline instead of the separating space.
                yield '\n'
                col = len(pieces[-1])
            else:
                yield ' '
                if len(pieces) > 1:
                    # The chunk itself contains a newline; restart the count
                    # from whatever follows its last newline.
                    col = len(pieces[-1])
            yield chunk
    return u''.join(_generator())
wrap = allow_lazy(wrap, unicode)
def truncate_words(s, num):
    """
    Truncates a string after a certain number of words.

    Joins the surviving words with single spaces and appends '...' when
    truncation actually occurred (unless the last kept word already ends
    with an ellipsis).
    """
    s = force_unicode(s)
    limit = int(num)
    pieces = s.split()
    if len(pieces) > limit:
        pieces = pieces[:limit]
        if not pieces[-1].endswith('...'):
            pieces.append('...')
    return u' '.join(pieces)
truncate_words = allow_lazy(truncate_words, unicode)
def truncate_html_words(s, num):
    """
    Truncates html to a certain number of words (not counting tags and
    comments). Closes opened tags if they were correctly closed in the given
    html.

    Returns the input unchanged when it contains ``num`` words or fewer;
    returns ``u''`` when ``num`` is zero or negative.
    """
    s = force_unicode(s)
    length = int(num)
    if length <= 0:
        return u''
    # Tags that never take a closing tag in HTML 4, so they are never
    # tracked in open_tags below.
    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
    # Set up regular expressions.
    # re_words matches, in order of preference: an entity reference, a tag,
    # or (captured in group 1) an actual word.
    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
    # re_tag splits a matched tag into (closing slash, tag name, self-closing slash).
    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
    # Count non-HTML words and keep note of open tags
    pos = 0
    ellipsis_pos = 0  # character offset where the ellipsis will be inserted
    words = 0
    open_tags = []
    while words <= length:
        m = re_words.search(s, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                ellipsis_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m.group(0))
        if not tag or ellipsis_pos:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i+1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return s
    out = s[:ellipsis_pos] + ' ...'
    # Close any tags still open
    for tag in open_tags:
        out += '</%s>' % tag
    # Return string
    return out
truncate_html_words = allow_lazy(truncate_html_words, unicode)
def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; other
    spaces are converted to underscores; and all non-filename-safe characters
    are removed.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    u'johns_portrait_in_2004.jpg'
    """
    cleaned = force_unicode(s).strip().replace(' ', '_')
    return re.sub(r'[^-A-Za-z0-9_.]', '', cleaned)
get_valid_filename = allow_lazy(get_valid_filename, unicode)
def get_text_list(list_, last_word=ugettext_lazy(u'or')):
    """
    Joins the items of a list into a comma-separated phrase, with the final
    pair joined by ``last_word``.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    u'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    u'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    u'a and b'
    >>> get_text_list(['a'])
    u'a'
    >>> get_text_list([])
    u''
    """
    if not list_:
        return u''
    if len(list_) == 1:
        return force_unicode(list_[0])
    all_but_last = ', '.join([force_unicode(i) for i in list_[:-1]])
    return u'%s %s %s' % (all_but_last, force_unicode(last_word), force_unicode(list_[-1]))
get_text_list = allow_lazy(get_text_list, unicode)
def normalize_newlines(text):
    "Normalizes CRLF and bare CR line endings in the text to LF."
    normalized = re.sub(r'\r\n|\r|\n', '\n', text)
    return force_unicode(normalized)
normalize_newlines = allow_lazy(normalize_newlines, unicode)
def recapitalize(text):
    """
    Recapitalizes text, placing caps after end-of-sentence punctuation.

    The whole string is lowercased first; then the first letter of the string
    and the first letter following each '.', '?' or '!' plus a space are
    upper-cased.
    """
    text = force_unicode(text).lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    return text
# Pass ``unicode`` as the result class, consistent with every other
# allow_lazy() wrapper in this module; lazy() needs at least one result
# class to build its string proxy.
recapitalize = allow_lazy(recapitalize, unicode)
def phone2numeric(phone):
    """
    Converts a phone number with letters into its numeric equivalent.

    Letters are mapped per the North American telephone keypad; 'Q' and 'Z'
    (and any non-letter characters) are left untouched. Matching is
    case-insensitive.
    """
    letters = re.compile(r'[A-PR-Y]', re.I)
    keypad = {'a': '2', 'b': '2', 'c': '2',
              'd': '3', 'e': '3', 'f': '3',
              'g': '4', 'h': '4', 'i': '4',
              'j': '5', 'k': '5', 'l': '5',
              'm': '6', 'n': '6', 'o': '6',
              'p': '7', 'r': '7', 's': '7',
              't': '8', 'u': '8', 'v': '8',
              'w': '9', 'x': '9', 'y': '9'}
    char2number = lambda m: keypad.get(m.group(0).lower())
    return letters.sub(char2number, phone)
# Pass ``unicode`` as the result class, consistent with every other
# allow_lazy() wrapper in this module; lazy() needs at least one result
# class to build its string proxy.
phone2numeric = allow_lazy(phone2numeric, unicode)
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    "Gzip-compresses the given byte string and returns the compressed bytes."
    import cStringIO, gzip
    buf = cStringIO.StringIO()
    gz = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=buf)
    gz.write(s)
    gz.close()
    return buf.getvalue()
ustring_re = re.compile(u"([\u0080-\uffff])")

def javascript_quote(s, quote_double_quotes=False):
    """
    Escapes a string for safe inclusion in a JavaScript string literal.

    Backslashes, carriage returns, newlines, tabs and single quotes are
    backslash-escaped; non-ASCII characters become JavaScript unicode
    escapes. When ``quote_double_quotes`` is true, double quotes are
    replaced with the ``&quot;`` HTML entity. Accepts str (assumed UTF-8)
    or unicode; raises TypeError for anything else.
    """
    def escape_unicode(match):
        return r"\u%04x" % ord(match.group(1))

    if type(s) == str:
        s = s.decode('utf-8')
    elif type(s) != unicode:
        raise TypeError(s)
    for plain, escaped in (('\\', '\\\\'), ('\r', '\\r'), ('\n', '\\n'),
                           ('\t', '\\t'), ("'", "\\'")):
        s = s.replace(plain, escaped)
    if quote_double_quotes:
        s = s.replace('"', '&quot;')
    return str(ustring_re.sub(escape_unicode, s))
javascript_quote = allow_lazy(javascript_quote, unicode)
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
    r"""
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.

    >>> list(smart_split(r'This is "a person\'s" test.'))
    [u'This', u'is', u'"a person\\\'s"', u'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    [u'Another', u"'person's'", u'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    [u'A', u'""funky" style"', u'test.']
    """
    text = force_unicode(text)
    for match in smart_split_re.finditer(text):
        token = match.group(0)
        if token.startswith('"') and token.endswith('"'):
            # Unescape the quoted body but keep the surrounding quotes.
            yield '"' + token[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif token.startswith("'") and token.endswith("'"):
            yield "'" + token[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield token
smart_split = allow_lazy(smart_split, unicode)
def _replace_entity(match):
    """
    Resolves one matched HTML entity (group 1, without '&' and ';') to its
    unicode character; returns the original matched text when the entity is
    unknown or out of range.
    """
    entity = match.group(1)
    if entity[0] != u'#':
        # Named entity, e.g. 'amp' -- look it up in the HTML entity table.
        try:
            return unichr(name2codepoint[entity])
        except (ValueError, KeyError):
            return match.group(0)
    # Numeric character reference: decimal ('#65') or hex ('#x41'/'#X41').
    entity = entity[1:]
    try:
        if entity[0] in u'xX':
            codepoint = int(entity[1:], 16)
        else:
            codepoint = int(entity)
        return unichr(codepoint)
    except ValueError:
        return match.group(0)
# Matches '&...;' entity references: numeric (decimal or hex) and named.
_entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape_entities(text):
    "Replaces all HTML entity references in the text with their unicode characters."
    return _entity_re.sub(_replace_entity, text)
unescape_entities = allow_lazy(unescape_entities, unicode)