Fix day filter
[cds-indico.git] / indico / util / string.py
blob6caf27a76dbef0f52e63642eb5c5d1641cf687ed
1 # This file is part of Indico.
2 # Copyright (C) 2002 - 2015 European Organization for Nuclear Research (CERN).
4 # Indico is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License as
6 # published by the Free Software Foundation; either version 3 of the
7 # License, or (at your option) any later version.
9 # Indico is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 # General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with Indico; if not, see <http://www.gnu.org/licenses/>.
17 """
18 String manipulation functions
19 """
21 import functools
22 import re
23 import unicodedata
24 from uuid import uuid4
26 import markdown
27 import bleach
28 from lxml import html, etree
29 from speaklater import _LazyString
31 try:
32 import translitcodec
33 except ImportError:
34 translitcodec = None
# Tags allowed through bleach sanitization, on top of bleach's defaults.
BLEACH_ALLOWED_TAGS = bleach.ALLOWED_TAGS + ['sup', 'sub', 'small']
# Private-use-area character used to temporarily stand in for LaTeX math
# segments while markdown/bleach process the rest of the text.
LATEX_MATH_PLACEHOLDER = u"\uE000"
def encode_if_unicode(s):
    """Return *s* as UTF-8 bytes when it is (or lazily wraps) unicode.

    Lazy strings whose underlying value is unicode are evaluated first;
    any non-unicode value is returned unchanged.
    """
    # Force evaluation of lazy translations that wrap a unicode value.
    if isinstance(s, _LazyString) and isinstance(s.value, unicode):
        s = unicode(s)
    if isinstance(s, unicode):
        return s.encode('utf-8')
    return s
def unicodeOrNone(s):
    """Decode a UTF-8 byte string, passing ``None`` straight through."""
    if s is None:
        return None
    return s.decode('utf-8')
def safe_upper(string):
    """Uppercase *string*, round-tripping byte strings through UTF-8."""
    if not isinstance(string, unicode):
        # Decode so multi-byte characters uppercase correctly, then re-encode.
        return string.decode('utf-8').upper().encode('utf-8')
    return string.upper()
def safe_slice(string, start, stop=None):
    """Slice *string* by character, decoding byte strings as UTF-8 first.

    :param start: slice start index
    :param stop: slice stop index (``None`` means "to the end")
    """
    sl = slice(start, stop)
    if not isinstance(string, unicode):
        # Slice on code points, not bytes, then restore the original encoding.
        return string.decode('utf-8')[sl].encode('utf-8')
    return string[sl]
def remove_accents(text, reencode=True):
    """Return *text* with combining accent marks stripped.

    :param reencode: if true, return UTF-8 bytes; otherwise unicode
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    # NFD splits accented chars into base char + combining mark ('Mn'),
    # which we then drop.
    stripped = u''.join(c for c in unicodedata.normalize('NFD', text)
                        if unicodedata.category(c) != 'Mn')
    return stripped.encode('utf-8') if reencode else stripped
def fix_broken_string(text, as_unicode=False):
    """Decode *text* trying UTF-8, then Latin-1, then UTF-8 with replacement.

    :param as_unicode: return unicode instead of re-encoded UTF-8 bytes
    """
    for codec in ('utf-8', 'latin1'):
        try:
            decoded = text.decode(codec)
            break
        except UnicodeDecodeError:
            continue
    else:
        # Last resort: replace undecodable bytes instead of failing.
        decoded = unicode(text, 'utf-8', errors='replace')
    return decoded if as_unicode else decoded.encode('utf-8')
def to_unicode(text):
    """Converts a string to unicode if it isn't already unicode."""
    if isinstance(text, unicode):
        return text
    if isinstance(text, str):
        # Byte strings may be broken; repair while decoding.
        return fix_broken_string(text, as_unicode=True)
    # Not a string at all: rely on the object's unicode conversion.
    return unicode(text)
def fix_broken_obj(obj):
    """Recursively repair broken byte strings inside *obj*.

    Dicts and lists are rebuilt with their contents fixed, byte strings
    are repaired via :func:`fix_broken_string`, and unicode values are
    returned untouched.

    :raises ValueError: for any other object type
    """
    if isinstance(obj, dict):
        return dict((k, fix_broken_obj(v)) for k, v in obj.iteritems())
    elif isinstance(obj, list):
        return map(fix_broken_obj, obj)
    elif isinstance(obj, str):
        return fix_broken_string(obj)
    elif isinstance(obj, unicode):
        # Bug fix: the original had a bare `pass` here and fell off the end
        # of the function, silently returning None for unicode input.
        return obj
    else:
        raise ValueError('Invalid object type in fix_broken_obj: {0}'.format(type(obj)))
def remove_non_alpha(text):
    """Strip every character that is not alphanumeric from *text*."""
    return ''.join(filter(lambda ch: ch.isalnum(), text))
def unicode_to_ascii(text):
    """Best-effort transliteration of unicode *text* to a plain ASCII str.

    Non-unicode values are returned unchanged.  Uses translitcodec when it
    is installed, falling back to NFKD normalization; characters with no
    ASCII equivalent are dropped.
    """
    if not isinstance(text, unicode):
        return text
    normalized = (text.encode('translit/long') if translitcodec
                  else unicodedata.normalize('NFKD', text))
    return normalized.encode('ascii', 'ignore')
def unicode_struct_to_utf8(obj):
    """Recursively encode every unicode string inside *obj* to UTF-8 bytes.

    Lists and dicts are rebuilt; any other type is returned untouched.
    """
    if isinstance(obj, dict):
        return {unicode_struct_to_utf8(key): unicode_struct_to_utf8(value)
                for key, value in obj.iteritems()}
    if isinstance(obj, list):
        return map(unicode_struct_to_utf8, obj)
    if isinstance(obj, unicode):
        return obj.encode('utf-8', 'replace')
    return obj
def return_ascii(f):
    """Decorator normalizing a function's unicode result to a plain string.

    This is useful for __repr__ methods which **MUST** return a plain string
    to avoid encoding to utf8 or ascii all the time.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        result = f(*args, **kwargs)
        return unicode_to_ascii(result)
    return wrapper
def html_line_breaks(text):
    """Convert blank-line-separated text into <p> paragraphs with <br/> breaks."""
    paragraphs = text.replace('\n\n', '</p><p>')
    return '<p>{0}</p>'.format(paragraphs.replace('\n', '<br/>'))
def truncate(text, max_size, ellipsis='...', encoding='utf-8'):
    """Truncate text, taking unicode chars into account."""
    was_bytes = isinstance(text, str)
    if was_bytes:
        # Work on code points so we never cut a multi-byte char in half.
        text = text.decode(encoding)
    if len(text) > max_size:
        text = text[:max_size] + ellipsis
    return text.encode(encoding) if was_bytes else text
def permissive_format(text, params):
    """Replace ``{key}`` placeholders in *text* using *params*.

    Unlike ``str.format`` this tolerates missing or unmatched braces:
    placeholders without a corresponding key are simply left alone.
    """
    for key, value in params.iteritems():
        placeholder = '{%s}' % key
        text = text.replace(placeholder, str(value))
    return text
def remove_extra_spaces(text):
    """Collapse runs of spaces to one and strip whitespace around the text.

    'Text with  spaces ' becomes 'Text with spaces'
    """
    return re.sub(r' {2,}', ' ', text).strip()
def remove_tags(text):
    """Strip HTML-like tags from *text* and tidy the leftover whitespace.

    Tag names are not validated: a <no-valid-tag></no-valid-tag> pair is
    removed just the same.
    """
    tag_re = re.compile(r"<(\w|\/)[^<\s\"']*?>")
    without_tags = tag_re.sub(' ', text)
    return remove_extra_spaces(without_tags)
def render_markdown(text, escape_latex_math=True, md=None, **kwargs):
    """Mako markdown to HTML filter

    :param text: Markdown source to convert to HTML
    :param escape_latex_math: Whether math expression should
                              be left untouched
    :param md: An alternative markdown processor (can be used
               to generate e.g. a different format)
    :param kwargs: Extra arguments to pass on to the markdown
                   processor
    """
    if escape_latex_math:
        math_segments = []

        def _math_replace(m):
            # Stash the math source and emit a placeholder so the
            # markdown/bleach passes cannot mangle it.
            math_segments.append(m.group(0))
            return LATEX_MATH_PLACEHOLDER

        # Match display math ($$...$$) before inline math ($...$).  The
        # original pattern's second alternative `\$\$(^\$)\$\$` could never
        # match (`^` cannot succeed mid-pattern there), so `$$x$$` was
        # partially consumed by the inline alternative.
        text = re.sub(r'\$\$[^\$]+\$\$|\$[^\$]+\$', _math_replace, to_unicode(text))

    if md is None:
        result = markdown.markdown(bleach.clean(text, tags=BLEACH_ALLOWED_TAGS), **kwargs)
    else:
        result = md(text, **kwargs)

    if escape_latex_math:
        # Restore the stashed math segments in order.
        return re.sub(LATEX_MATH_PLACEHOLDER, lambda _: math_segments.pop(0), result)
    else:
        return result
def sanitize_for_platypus(text):
    """Sanitize HTML to be used in platypus"""
    tags = ['b', 'br', 'em', 'font', 'i', 'img', 'strike', 'strong', 'sub', 'sup', 'u', 'span', 'div', 'p']
    attrs = {
        'font': ['size', 'face', 'color'],
        'img': ['src', 'width', 'height', 'valign']
    }  # closing brace restored; it was missing and left the dict literal unterminated
    res = bleach.clean(text, tags=tags, attributes=attrs, strip=True)
    # Convert to XHTML
    doc = html.fromstring(res)
    return etree.tostring(doc)
# TODO: reference implementation from MaKaC
# but, it's not totally correct according to RFC, see test cases
# However, this regex is pretty good in term of practicality
# but it may be updated to cover all cases
# NOTE: the `@` separator line was missing, so the pattern never required
# an at-sign between the local part and the domain; restored below.
VALID_EMAIL_REGEX = re.compile(r"""[-a-zA-Z0-9!#$%&'*+/=?\^_`{|}~]+
                                   (?:.[-a-zA-Z0-9!#$%&'*+/=?^_`{|}~]+)*
                                   @
                                   (?:[a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?.)+
                                   [a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?""", re.X)
def is_valid_mail(emails_string, multi=True):
    """Check the validity of an email address or a series of email addresses.

    :param emails_string: a single email address, or several addresses
                          separated by whitespace, ';' or ','
    :param multi: flag if multiple email addresses are allowed

    :return: True if the (non-empty) addresses are all valid
    """
    addresses = re.split(r'[\s;,]+', emails_string)
    # Note: empty chunks (e.g. from a trailing separator) still count
    # toward the single-address limit below.
    if not multi and len(addresses) > 1:
        return False
    return all(VALID_EMAIL_REGEX.match(address) for address in addresses if address)
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Sort key that orders embedded digit runs numerically ('x2' < 'x10')."""
    def _convert(chunk):
        return int(chunk) if chunk.isdigit() else chunk.lower()
    return [_convert(chunk) for chunk in _nsre.split(s)]
def seems_html(string):
    """Guess whether *string* contains HTML by looking for a simple tag."""
    return bool(re.search(r'<[a-z]+?>', string))
def strip_control_chars(string):
    """Drop ASCII control characters in the 0x0B-0x1F range from *string*.

    Tab (0x09) and newline (0x0A) are deliberately kept.
    """
    return ''.join(ch for ch in string if not ('\x0b' <= ch <= '\x1f'))
def html_color_to_rgb(hexcolor):
    """Convert a '#RRGGBB' or '#RGB' color string to an (r, g, b) tuple.

    Each channel is returned as a float in [0, 1].

    :raises ValueError: if the string has no leading '#' or a bad length
    """
    if not hexcolor.startswith('#'):
        raise ValueError("Invalid color string '{}' (should start with '#')".format(hexcolor))

    hexcolor = hexcolor[1:]

    if len(hexcolor) not in {3, 6}:
        # fixed a stray doubled quote in the original message ("'#{}''")
        raise ValueError("'#{}' is not in #RRGGBB or #RGB format".format(hexcolor))

    if len(hexcolor) == 3:
        # expand shorthand: 'f0a' -> 'ff00aa'
        hexcolor = ''.join(c * 2 for c in hexcolor)

    return tuple(float(int(hexcolor[i:i + 2], 16)) / 255 for i in range(0, 6, 2))
def strip_whitespace(s):
    """Remove trailing/leading whitespace if a string was passed.

    Useful e.g. in WTForms filters, where the value may be ``None`` or
    some other non-string type that must pass through untouched.
    """
    return s.strip() if isinstance(s, basestring) else s
def make_unique_token(is_unique):
    """Create a unique UUID4-based token.

    :param is_unique: a callable invoked with a candidate token which
                      should return a truthy value when the token is
                      acceptable
    """
    while True:
        token = unicode(uuid4())
        if is_unique(token):
            return token
def encode_utf8(f):
    """Decorator encoding a function's result to a UTF-8 byte string.

    Falsy results become the empty string; unicode results are encoded,
    anything else goes through ``str``.
    """
    @functools.wraps(f)
    def _wrapper(*args, **kwargs):
        rv = f(*args, **kwargs)
        if not rv:
            return ''
        if isinstance(rv, unicode):
            return rv.encode('utf-8')
        return str(rv)
    return _wrapper
def is_legacy_id(id_):
    """Check whether *id_* is a broken legacy ID.

    These IDs are not compatible with new code since they are not numeric
    or have a leading zero, resulting in different objects sharing the
    same numeric id.
    """
    if isinstance(id_, (int, long)):
        return False
    # Short-circuit keeps `int(id_)` from ever seeing a non-digit string.
    return not id_.isdigit() or str(int(id_)) != id_