Fixed #8516: Corrected typos in UK localflavor documentation
[django.git] / django / utils / text.py
blob3686a454a8c7eff5119ad9df2f8cea26d8ebacc5
1 import re
2 from django.conf import settings
3 from django.utils.encoding import force_unicode
4 from django.utils.functional import allow_lazy
5 from django.utils.translation import ugettext_lazy
6 from htmlentitydefs import name2codepoint
def capfirst(x):
    "Capitalizes the first letter of a string; falsy values pass through unchanged."
    if not x:
        return x
    value = force_unicode(x)
    return value[0].upper() + value[1:]
capfirst = allow_lazy(capfirst, unicode)
def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines.
    """
    text = force_unicode(text)
    def _generator():
        chunks = iter(text.split(' '))
        chunk = chunks.next()
        yield chunk
        # Current column, counted from the last newline inside the first chunk
        # (or from its start if it has none).
        col = len(chunk) - chunk.rfind('\n') - 1
        for chunk in chunks:
            if '\n' in chunk:
                pieces = chunk.split('\n')
            else:
                pieces = (chunk,)
            # The space plus the part of the chunk up to its first newline.
            col += len(pieces[0]) + 1
            if col > width:
                # Emit a newline instead of the separating space.
                yield '\n'
                col = len(pieces[-1])
            else:
                yield ' '
                if len(pieces) > 1:
                    # The chunk itself contains a newline; restart the count
                    # from whatever follows its last newline.
                    col = len(pieces[-1])
            yield chunk
    return u''.join(_generator())
wrap = allow_lazy(wrap, unicode)
def truncate_words(s, num):
    """
    Truncates a string after a certain number of words.

    Joins the surviving words with single spaces and appends '...' when
    truncation actually occurred (unless the last kept word already ends
    with an ellipsis).
    """
    s = force_unicode(s)
    limit = int(num)
    pieces = s.split()
    if len(pieces) > limit:
        pieces = pieces[:limit]
        if not pieces[-1].endswith('...'):
            pieces.append('...')
    return u' '.join(pieces)
truncate_words = allow_lazy(truncate_words, unicode)
def truncate_html_words(s, num):
    """
    Truncates html to a certain number of words (not counting tags and
    comments). Closes opened tags if they were correctly closed in the given
    html.

    Returns the input unchanged when it contains ``num`` words or fewer;
    returns ``u''`` when ``num`` is zero or negative.
    """
    s = force_unicode(s)
    length = int(num)
    if length <= 0:
        return u''
    # Tags that never take a closing tag in HTML 4, so they are never
    # tracked in open_tags below.
    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
    # Set up regular expressions.
    # re_words matches, in order of preference: an entity reference, a tag,
    # or (captured in group 1) an actual word.
    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
    # re_tag splits a matched tag into (closing slash, tag name, self-closing slash).
    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
    # Count non-HTML words and keep note of open tags
    pos = 0
    ellipsis_pos = 0  # character offset where the ellipsis will be inserted
    words = 0
    open_tags = []
    while words <= length:
        m = re_words.search(s, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                ellipsis_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m.group(0))
        if not tag or ellipsis_pos:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag,
                # all unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i+1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return s
    out = s[:ellipsis_pos] + ' ...'
    # Close any tags still open
    for tag in open_tags:
        out += '</%s>' % tag
    # Return string
    return out
truncate_html_words = allow_lazy(truncate_html_words, unicode)
def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; other
    spaces are converted to underscores; and all non-filename-safe characters
    are removed.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    u'johns_portrait_in_2004.jpg'
    """
    cleaned = force_unicode(s).strip().replace(' ', '_')
    return re.sub(r'[^-A-Za-z0-9_.]', '', cleaned)
get_valid_filename = allow_lazy(get_valid_filename, unicode)
def get_text_list(list_, last_word=ugettext_lazy(u'or')):
    """
    Joins the items of a list into a comma-separated phrase, with the final
    pair joined by ``last_word``.

    >>> get_text_list(['a', 'b', 'c', 'd'])
    u'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    u'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    u'a and b'
    >>> get_text_list(['a'])
    u'a'
    >>> get_text_list([])
    u''
    """
    if not list_:
        return u''
    if len(list_) == 1:
        return force_unicode(list_[0])
    all_but_last = ', '.join([force_unicode(i) for i in list_[:-1]])
    return u'%s %s %s' % (all_but_last, force_unicode(last_word), force_unicode(list_[-1]))
get_text_list = allow_lazy(get_text_list, unicode)
def normalize_newlines(text):
    "Normalizes CRLF and bare CR line endings in the text to LF."
    normalized = re.sub(r'\r\n|\r|\n', '\n', text)
    return force_unicode(normalized)
normalize_newlines = allow_lazy(normalize_newlines, unicode)
def recapitalize(text):
    """
    Recapitalizes text, placing caps after end-of-sentence punctuation.

    The whole string is lowercased first; then the first letter of the string
    and the first letter following each '.', '?' or '!' plus a space are
    upper-cased.
    """
    text = force_unicode(text).lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    return text
# Pass ``unicode`` as the result class, consistent with every other
# allow_lazy() wrapper in this module; lazy() needs at least one result
# class to build its string proxy.
recapitalize = allow_lazy(recapitalize, unicode)
def phone2numeric(phone):
    """
    Converts a phone number with letters into its numeric equivalent.

    Letters are mapped per the North American telephone keypad; 'Q' and 'Z'
    (and any non-letter characters) are left untouched. Matching is
    case-insensitive.
    """
    letters = re.compile(r'[A-PR-Y]', re.I)
    keypad = {'a': '2', 'b': '2', 'c': '2',
              'd': '3', 'e': '3', 'f': '3',
              'g': '4', 'h': '4', 'i': '4',
              'j': '5', 'k': '5', 'l': '5',
              'm': '6', 'n': '6', 'o': '6',
              'p': '7', 'r': '7', 's': '7',
              't': '8', 'u': '8', 'v': '8',
              'w': '9', 'x': '9', 'y': '9'}
    char2number = lambda m: keypad.get(m.group(0).lower())
    return letters.sub(char2number, phone)
# Pass ``unicode`` as the result class, consistent with every other
# allow_lazy() wrapper in this module; lazy() needs at least one result
# class to build its string proxy.
phone2numeric = allow_lazy(phone2numeric, unicode)
# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    "Gzip-compresses the given byte string and returns the compressed bytes."
    import cStringIO, gzip
    buf = cStringIO.StringIO()
    gz = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=buf)
    gz.write(s)
    gz.close()
    return buf.getvalue()
ustring_re = re.compile(u"([\u0080-\uffff])")

def javascript_quote(s, quote_double_quotes=False):
    """
    Escapes a string for safe inclusion in a JavaScript string literal.

    Backslashes, carriage returns, newlines, tabs and single quotes are
    backslash-escaped; non-ASCII characters become JavaScript unicode
    escapes. When ``quote_double_quotes`` is true, double quotes are
    replaced with the ``&quot;`` HTML entity. Accepts str (assumed UTF-8)
    or unicode; raises TypeError for anything else.
    """
    def escape_unicode(match):
        return r"\u%04x" % ord(match.group(1))

    if type(s) == str:
        s = s.decode('utf-8')
    elif type(s) != unicode:
        raise TypeError(s)
    for plain, escaped in (('\\', '\\\\'), ('\r', '\\r'), ('\n', '\\n'),
                           ('\t', '\\t'), ("'", "\\'")):
        s = s.replace(plain, escaped)
    if quote_double_quotes:
        s = s.replace('"', '&quot;')
    return str(ustring_re.sub(escape_unicode, s))
javascript_quote = allow_lazy(javascript_quote, unicode)
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
    r"""
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks.

    >>> list(smart_split(r'This is "a person\'s" test.'))
    [u'This', u'is', u'"a person\\\'s"', u'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    [u'Another', u"'person's'", u'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    [u'A', u'""funky" style"', u'test.']
    """
    text = force_unicode(text)
    for match in smart_split_re.finditer(text):
        token = match.group(0)
        if token.startswith('"') and token.endswith('"'):
            # Unescape the quoted body but keep the surrounding quotes.
            yield '"' + token[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
        elif token.startswith("'") and token.endswith("'"):
            yield "'" + token[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
        else:
            yield token
smart_split = allow_lazy(smart_split, unicode)
def _replace_entity(match):
    """
    Resolves one matched HTML entity (group 1, without '&' and ';') to its
    unicode character; returns the original matched text when the entity is
    unknown or out of range.
    """
    entity = match.group(1)
    if entity[0] != u'#':
        # Named entity, e.g. 'amp' -- look it up in the HTML entity table.
        try:
            return unichr(name2codepoint[entity])
        except (ValueError, KeyError):
            return match.group(0)
    # Numeric character reference: decimal ('#65') or hex ('#x41'/'#X41').
    entity = entity[1:]
    try:
        if entity[0] in u'xX':
            codepoint = int(entity[1:], 16)
        else:
            codepoint = int(entity)
        return unichr(codepoint)
    except ValueError:
        return match.group(0)
# Matches '&...;' entity references: numeric (decimal or hex) and named.
_entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape_entities(text):
    "Replaces all HTML entity references in the text with their unicode characters."
    return _entity_re.sub(_replace_entity, text)
unescape_entities = allow_lazy(unescape_entities, unicode)