Fix day filter
[cds-indico.git] / indico / util / string.py
blob6caf27a76dbef0f52e63642eb5c5d1641cf687ed
1 # This file is part of Indico.
2 # Copyright (C) 2002 - 2015 European Organization for Nuclear Research (CERN).
4 # Indico is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License as
6 # published by the Free Software Foundation; either version 3 of the
7 # License, or (at your option) any later version.
9 # Indico is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 # General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with Indico; if not, see <http://www.gnu.org/licenses/>.
17 """
18 String manipulation functions
19 """
21 import functools
22 import re
23 import unicodedata
24 from uuid import uuid4
26 import markdown
27 import bleach
28 from lxml import html, etree
29 from speaklater import _LazyString
31 try:
32 import translitcodec
33 except ImportError:
34 translitcodec = None
# Tags allowed through bleach sanitization, on top of bleach's defaults.
BLEACH_ALLOWED_TAGS = bleach.ALLOWED_TAGS + ['sup', 'sub', 'small']
# Private-use-area character used to temporarily stand in for LaTeX math
# segments while markdown/bleach process the rest of the text.
LATEX_MATH_PLACEHOLDER = u"\uE000"
def encode_if_unicode(s):
    """Return *s* as UTF-8 bytes when it is (or lazily wraps) unicode.

    Lazy strings whose underlying value is unicode are evaluated first;
    any non-unicode value is returned unchanged.
    """
    # Force evaluation of lazy translations that wrap a unicode value.
    if isinstance(s, _LazyString) and isinstance(s.value, unicode):
        s = unicode(s)
    if isinstance(s, unicode):
        return s.encode('utf-8')
    return s
def unicodeOrNone(s):
    """Decode a UTF-8 byte string, passing ``None`` straight through."""
    if s is None:
        return None
    return s.decode('utf-8')
def safe_upper(string):
    """Uppercase *string*, round-tripping byte strings through UTF-8."""
    if not isinstance(string, unicode):
        # Decode so multi-byte characters uppercase correctly, then re-encode.
        return string.decode('utf-8').upper().encode('utf-8')
    return string.upper()
def safe_slice(string, start, stop=None):
    """Slice *string* by character, decoding byte strings as UTF-8 first.

    :param start: slice start index
    :param stop: slice stop index (``None`` means "to the end")
    """
    sl = slice(start, stop)
    if not isinstance(string, unicode):
        # Slice on code points, not bytes, then restore the original encoding.
        return string.decode('utf-8')[sl].encode('utf-8')
    return string[sl]
def remove_accents(text, reencode=True):
    """Return *text* with combining accent marks stripped.

    :param reencode: if true, return UTF-8 bytes; otherwise unicode
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    # NFD splits accented chars into base char + combining mark ('Mn'),
    # which we then drop.
    stripped = u''.join(c for c in unicodedata.normalize('NFD', text)
                        if unicodedata.category(c) != 'Mn')
    return stripped.encode('utf-8') if reencode else stripped
def fix_broken_string(text, as_unicode=False):
    """Decode *text* trying UTF-8, then Latin-1, then UTF-8 with replacement.

    :param as_unicode: return unicode instead of re-encoded UTF-8 bytes
    """
    for codec in ('utf-8', 'latin1'):
        try:
            decoded = text.decode(codec)
            break
        except UnicodeDecodeError:
            continue
    else:
        # Last resort: replace undecodable bytes instead of failing.
        decoded = unicode(text, 'utf-8', errors='replace')
    return decoded if as_unicode else decoded.encode('utf-8')
def to_unicode(text):
    """Converts a string to unicode if it isn't already unicode."""
    if isinstance(text, unicode):
        return text
    if isinstance(text, str):
        # Byte strings may be broken; repair while decoding.
        return fix_broken_string(text, as_unicode=True)
    # Not a string at all: rely on the object's unicode conversion.
    return unicode(text)
def fix_broken_obj(obj):
    """Recursively repair broken byte strings inside *obj*.

    Dicts and lists are rebuilt with their contents fixed, byte strings
    are repaired via :func:`fix_broken_string`, and unicode values are
    returned untouched.

    :raises ValueError: for any other object type
    """
    if isinstance(obj, dict):
        return dict((k, fix_broken_obj(v)) for k, v in obj.iteritems())
    elif isinstance(obj, list):
        return map(fix_broken_obj, obj)
    elif isinstance(obj, str):
        return fix_broken_string(obj)
    elif isinstance(obj, unicode):
        # Bug fix: the original had a bare `pass` here and fell off the end
        # of the function, silently returning None for unicode input.
        return obj
    else:
        raise ValueError('Invalid object type in fix_broken_obj: {0}'.format(type(obj)))
def remove_non_alpha(text):
    """Strip every character that is not alphanumeric from *text*."""
    return ''.join(filter(lambda ch: ch.isalnum(), text))
def unicode_to_ascii(text):
    """Best-effort transliteration of unicode *text* to a plain ASCII str.

    Non-unicode values are returned unchanged.  Uses translitcodec when it
    is installed, falling back to NFKD normalization; characters with no
    ASCII equivalent are dropped.
    """
    if not isinstance(text, unicode):
        return text
    normalized = (text.encode('translit/long') if translitcodec
                  else unicodedata.normalize('NFKD', text))
    return normalized.encode('ascii', 'ignore')
def unicode_struct_to_utf8(obj):
    """Recursively encode every unicode string inside *obj* to UTF-8 bytes.

    Lists and dicts are rebuilt; any other type is returned untouched.
    """
    if isinstance(obj, dict):
        return {unicode_struct_to_utf8(key): unicode_struct_to_utf8(value)
                for key, value in obj.iteritems()}
    if isinstance(obj, list):
        return map(unicode_struct_to_utf8, obj)
    if isinstance(obj, unicode):
        return obj.encode('utf-8', 'replace')
    return obj
def return_ascii(f):
    """Decorator normalizing a function's unicode result to a plain string.

    This is useful for __repr__ methods which **MUST** return a plain string
    to avoid encoding to utf8 or ascii all the time.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        result = f(*args, **kwargs)
        return unicode_to_ascii(result)
    return wrapper
def html_line_breaks(text):
    """Convert blank-line-separated text into <p> paragraphs with <br/> breaks."""
    paragraphs = text.replace('\n\n', '</p><p>')
    return '<p>{0}</p>'.format(paragraphs.replace('\n', '<br/>'))
def truncate(text, max_size, ellipsis='...', encoding='utf-8'):
    """Truncate text, taking unicode chars into account."""
    was_bytes = isinstance(text, str)
    if was_bytes:
        # Work on code points so we never cut a multi-byte char in half.
        text = text.decode(encoding)
    if len(text) > max_size:
        text = text[:max_size] + ellipsis
    return text.encode(encoding) if was_bytes else text
def permissive_format(text, params):
    """Replace ``{key}`` placeholders in *text* using *params*.

    Unlike ``str.format`` this tolerates missing or unmatched braces:
    placeholders without a corresponding key are simply left alone.
    """
    for key, value in params.iteritems():
        placeholder = '{%s}' % key
        text = text.replace(placeholder, str(value))
    return text
def remove_extra_spaces(text):
    """Collapse runs of spaces to one and strip whitespace around the text.

    'Text with  spaces ' becomes 'Text with spaces'
    """
    return re.sub(r' {2,}', ' ', text).strip()
def remove_tags(text):
    """Strip HTML-like tags from *text* and tidy the leftover whitespace.

    Tag names are not validated: a <no-valid-tag></no-valid-tag> pair is
    removed just the same.
    """
    tag_re = re.compile(r"<(\w|\/)[^<\s\"']*?>")
    without_tags = tag_re.sub(' ', text)
    return remove_extra_spaces(without_tags)
def render_markdown(text, escape_latex_math=True, md=None, **kwargs):
    """Mako markdown to HTML filter

    :param text: Markdown source to convert to HTML
    :param escape_latex_math: Whether math expression should
                              be left untouched
    :param md: An alternative markdown processor (can be used
               to generate e.g. a different format)
    :param kwargs: Extra arguments to pass on to the markdown
                   processor
    """
    if escape_latex_math:
        math_segments = []

        def _math_replace(m):
            # Stash the math source and emit a placeholder so the
            # markdown/bleach passes cannot mangle it.
            math_segments.append(m.group(0))
            return LATEX_MATH_PLACEHOLDER

        # Match display math ($$...$$) before inline math ($...$).  The
        # original pattern's second alternative `\$\$(^\$)\$\$` could never
        # match (`^` cannot succeed mid-pattern there), so `$$x$$` was
        # partially consumed by the inline alternative.
        text = re.sub(r'\$\$[^\$]+\$\$|\$[^\$]+\$', _math_replace, to_unicode(text))

    if md is None:
        result = markdown.markdown(bleach.clean(text, tags=BLEACH_ALLOWED_TAGS), **kwargs)
    else:
        result = md(text, **kwargs)

    if escape_latex_math:
        # Restore the stashed math segments in order.
        return re.sub(LATEX_MATH_PLACEHOLDER, lambda _: math_segments.pop(0), result)
    else:
        return result
def sanitize_for_platypus(text):
    """Sanitize HTML to be used in platypus"""
    tags = ['b', 'br', 'em', 'font', 'i', 'img', 'strike', 'strong', 'sub', 'sup', 'u', 'span', 'div', 'p']
    attrs = {
        'font': ['size', 'face', 'color'],
        'img': ['src', 'width', 'height', 'valign']
    }  # closing brace restored; it was missing and left the dict literal unterminated
    res = bleach.clean(text, tags=tags, attributes=attrs, strip=True)
    # Convert to XHTML
    doc = html.fromstring(res)
    return etree.tostring(doc)
# TODO: reference implementation from MaKaC
# but, it's not totally correct according to RFC, see test cases
# However, this regex is pretty good in term of practicality
# but it may be updated to cover all cases
# NOTE: the `@` separator line was missing, so the pattern never required
# an at-sign between the local part and the domain; restored below.
VALID_EMAIL_REGEX = re.compile(r"""[-a-zA-Z0-9!#$%&'*+/=?\^_`{|}~]+
                                   (?:.[-a-zA-Z0-9!#$%&'*+/=?^_`{|}~]+)*
                                   @
                                   (?:[a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?.)+
                                   [a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?""", re.X)
def is_valid_mail(emails_string, multi=True):
    """Check the validity of an email address or a series of email addresses.

    :param emails_string: a single email address, or several addresses
                          separated by whitespace, ';' or ','
    :param multi: flag if multiple email addresses are allowed

    :return: True if the (non-empty) addresses are all valid
    """
    addresses = re.split(r'[\s;,]+', emails_string)
    # Note: empty chunks (e.g. from a trailing separator) still count
    # toward the single-address limit below.
    if not multi and len(addresses) > 1:
        return False
    return all(VALID_EMAIL_REGEX.match(address) for address in addresses if address)
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Sort key that orders embedded digit runs numerically ('x2' < 'x10')."""
    def _convert(chunk):
        return int(chunk) if chunk.isdigit() else chunk.lower()
    return [_convert(chunk) for chunk in _nsre.split(s)]
def seems_html(string):
    """Guess whether *string* contains HTML by looking for a simple tag."""
    return bool(re.search(r'<[a-z]+?>', string))
def strip_control_chars(string):
    """Drop ASCII control characters in the 0x0B-0x1F range from *string*.

    Tab (0x09) and newline (0x0A) are deliberately kept.
    """
    return ''.join(ch for ch in string if not ('\x0b' <= ch <= '\x1f'))
def html_color_to_rgb(hexcolor):
    """Convert a '#RRGGBB' or '#RGB' color string to an (r, g, b) tuple.

    Each channel is returned as a float in [0, 1].

    :raises ValueError: if the string has no leading '#' or a bad length
    """
    if not hexcolor.startswith('#'):
        raise ValueError("Invalid color string '{}' (should start with '#')".format(hexcolor))

    hexcolor = hexcolor[1:]

    if len(hexcolor) not in {3, 6}:
        # fixed a stray doubled quote in the original message ("'#{}''")
        raise ValueError("'#{}' is not in #RRGGBB or #RGB format".format(hexcolor))

    if len(hexcolor) == 3:
        # expand shorthand: 'f0a' -> 'ff00aa'
        hexcolor = ''.join(c * 2 for c in hexcolor)

    return tuple(float(int(hexcolor[i:i + 2], 16)) / 255 for i in range(0, 6, 2))
def strip_whitespace(s):
    """Remove trailing/leading whitespace if a string was passed.

    Useful e.g. in WTForms filters, where the value may be ``None`` or
    some other non-string type that must pass through untouched.
    """
    return s.strip() if isinstance(s, basestring) else s
def make_unique_token(is_unique):
    """Create a unique UUID4-based token.

    :param is_unique: a callable invoked with a candidate token which
                      should return a truthy value when the token is
                      acceptable
    """
    while True:
        token = unicode(uuid4())
        if is_unique(token):
            return token
def encode_utf8(f):
    """Decorator encoding a function's result to a UTF-8 byte string.

    Falsy results become the empty string; unicode results are encoded,
    anything else goes through ``str``.
    """
    @functools.wraps(f)
    def _wrapper(*args, **kwargs):
        rv = f(*args, **kwargs)
        if not rv:
            return ''
        if isinstance(rv, unicode):
            return rv.encode('utf-8')
        return str(rv)
    return _wrapper
def is_legacy_id(id_):
    """Check whether *id_* is a broken legacy ID.

    These IDs are not compatible with new code since they are not numeric
    or have a leading zero, resulting in different objects sharing the
    same numeric id.
    """
    if isinstance(id_, (int, long)):
        return False
    # Short-circuit keeps `int(id_)` from ever seeing a non-digit string.
    return not id_.isdigit() or str(int(id_)) != id_