1 # This file is part of Indico.
2 # Copyright (C) 2002 - 2015 European Organization for Nuclear Research (CERN).
4 # Indico is free software; you can redistribute it and/or
5 # modify it under the terms of the GNU General Public License as
6 # published by the Free Software Foundation; either version 3 of the
7 # License, or (at your option) any later version.
9 # Indico is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 # General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with Indico; if not, see <http://www.gnu.org/licenses/>.
18 String manipulation functions
24 from uuid
import uuid4
28 from lxml
import html
, etree
29 from speaklater
import _LazyString
# HTML tags allowed through bleach sanitization: bleach's defaults plus a
# few harmless formatting tags used in Indico content.
BLEACH_ALLOWED_TAGS = bleach.ALLOWED_TAGS + ['sup', 'sub', 'small']
# Unicode private-use-area character used as a temporary stand-in for LaTeX
# math segments while the surrounding text goes through markdown/bleach.
LATEX_MATH_PLACEHOLDER = u"\uE000"
def encode_if_unicode(s):
    """Return ``s`` encoded to a utf-8 bytestring if it is unicode.

    Lazy strings (``_LazyString``) wrapping a unicode value are evaluated
    first so they get encoded too; any other value passes through as-is.
    """
    if isinstance(s, _LazyString) and isinstance(s.value, unicode):
        # Materialize the lazy string so the unicode check below applies to it
        # (this line was missing in the reviewed copy, leaving the `if` empty).
        s = unicode(s)
    return s.encode('utf-8') if isinstance(s, unicode) else s
48 return None if s
is None else s
.decode('utf-8')
def safe_upper(string):
    """Upper-case a string regardless of whether it is str or unicode.

    Bytestrings are assumed to be utf-8 and are returned re-encoded; the
    reviewed copy was missing the unicode branch, making the `if` body empty.
    """
    if isinstance(string, unicode):
        return string.upper()
    else:
        return string.decode('utf-8').upper().encode('utf-8')
def safe_slice(string, start, stop=None):
    """Slice a string by unicode code points instead of raw bytes.

    :param string: a str (assumed utf-8) or unicode string
    :param start: slice start index
    :param stop: slice stop index (optional)

    The reviewed copy was missing the unicode branch, leaving the `if` empty.
    """
    slice_ = slice(start, stop)
    if isinstance(string, unicode):
        return string[slice_]
    else:
        # decode so multi-byte characters are not cut in half, then re-encode
        return string.decode('utf-8')[slice_].encode('utf-8')
def remove_accents(text, reencode=True):
    """Remove accents (combining marks) from a string.

    :param text: a str (assumed utf-8) or unicode string
    :param reencode: if True return a utf-8 bytestring, otherwise unicode

    Bug fixed: the reviewed copy always re-encoded, ignoring ``reencode``.
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    # NFD-decompose, then drop all combining marks (Unicode category 'Mn')
    result = u''.join((c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn'))
    if reencode:
        return result.encode('utf-8')
    else:
        return result
def fix_broken_string(text, as_unicode=False):
    """Decode a bytestring that may be utf-8 or latin1.

    Falls back to utf-8 with replacement characters if both decodings fail.

    :param text: the bytestring to decode
    :param as_unicode: if True return unicode, otherwise a utf-8 bytestring

    The reviewed copy had ``except`` clauses with no matching ``try`` lines;
    the nested try/except structure is restored here.
    """
    try:
        text = text.decode('utf-8')
    except UnicodeDecodeError:
        try:
            text = text.decode('latin1')
        except UnicodeDecodeError:
            text = unicode(text, 'utf-8', errors='replace')
    return text if as_unicode else text.encode('utf-8')
88 """Converts a string to unicode if it isn't already unicode."""
89 if isinstance(text
, unicode):
91 elif not isinstance(text
, str):
93 return fix_broken_string(text
, as_unicode
=True)
def fix_broken_obj(obj):
    """Recursively fix mis-encoded strings inside a data structure.

    :param obj: a dict, list, str or unicode object
    :raises ValueError: for any other type

    The reviewed copy had an empty ``elif isinstance(obj, unicode)`` branch;
    unicode values are returned unchanged.
    """
    if isinstance(obj, dict):
        return dict((k, fix_broken_obj(v)) for k, v in obj.iteritems())
    elif isinstance(obj, list):
        return map(fix_broken_obj, obj)
    elif isinstance(obj, str):
        return fix_broken_string(obj)
    elif isinstance(obj, unicode):
        return obj
    else:
        raise ValueError('Invalid object type in fix_broken_obj: {0}'.format(type(obj)))
def remove_non_alpha(text):
    """Return *text* with everything except alphanumeric characters removed."""
    alnum_chars = (ch for ch in text if ch.isalnum())
    return ''.join(alnum_chars)
def unicode_to_ascii(text):
    """Transliterate a unicode string down to plain ASCII.

    Non-unicode input is returned unchanged.  Transliteration goes through
    the ``translit/long`` codec (translitcodec); remaining non-ASCII
    characters are dropped by the final ``'ignore'`` encode.

    NOTE(review): the control-flow lines between the visible statements were
    missing from the reviewed copy; the try/except fallback to NFKD
    normalization is a reconstruction -- confirm the original exception type.
    """
    if not isinstance(text, unicode):
        return text
    try:
        text = text.encode('translit/long')
    except LookupError:
        # translitcodec not available -- fall back to plain NFKD decomposition
        text = unicodedata.normalize('NFKD', text)
    return text.encode('ascii', 'ignore')
def unicode_struct_to_utf8(obj):
    """Recursively encode all unicode strings in a structure to utf-8.

    Lists and dicts are rebuilt with their contents converted; any other
    type is returned unchanged.

    Bug fixed: the reviewed copy fell off the end for non-container,
    non-unicode values and returned None.
    """
    if isinstance(obj, unicode):
        return obj.encode('utf-8', 'replace')
    elif isinstance(obj, list):
        return map(unicode_struct_to_utf8, obj)
    elif isinstance(obj, dict):
        return {unicode_struct_to_utf8(k): unicode_struct_to_utf8(v) for k, v in obj.iteritems()}
    return obj
134 """Decorator to normalize all unicode characters.
136 This is useful for __repr__ methods which **MUST** return a plain string to
137 avoid encoding to utf8 or ascii all the time."""
    def wrapper(*args, **kwargs):
        # Closure helper of a decorator whose outer ``def`` is not visible in
        # this chunk; ``f`` is the wrapped callable from the enclosing scope.
        # Normalizes the wrapped function's return value to plain ASCII.
        return unicode_to_ascii(f(*args, **kwargs))
def html_line_breaks(text):
    """Convert plain-text line breaks into HTML paragraphs and ``<br/>`` tags.

    Blank lines (double newlines) separate paragraphs; single newlines
    become ``<br/>``.
    """
    paragraphs = text.replace('\n\n', '</p><p>')
    with_breaks = paragraphs.replace('\n', '<br/>')
    return '<p>' + with_breaks + '</p>'
def truncate(text, max_size, ellipsis='...', encoding='utf-8'):
    """
    Truncate text, taking unicode chars into account

    :param text: str (decoded with *encoding*) or unicode
    :param max_size: maximum number of characters to keep
    :param ellipsis: suffix appended only when truncation happens
    :param encoding: encoding used to decode/re-encode bytestrings

    Bugs fixed: the reviewed copy never tracked whether the input had to be
    re-encoded and was missing the final ``return``.
    """
    encode = False
    if isinstance(text, str):
        encode = True
        text = text.decode(encoding)
    if len(text) > max_size:
        text = text[:max_size] + ellipsis
    if encode:
        # give back the same type (bytestring) the caller passed in
        text = text.encode(encoding)
    return text
def permissive_format(text, params):
    """
    Format text using params from dictionary. Function is resistant to missing parentheses

    :param text: template string containing ``{key}`` placeholders
    :param params: dict of placeholder name -> value

    Bug fixed: the reviewed copy was missing the final ``return`` and thus
    always returned None.
    """
    for k, v in params.iteritems():
        text = text.replace("{" + k + "}", str(v))
    return text
def remove_extra_spaces(text):
    """
    Removes multiple spaces within text and removes whitespace around the text
    'Text with  spaces ' becomes 'Text with spaces'
    """
    collapsed = re.compile(r" +").sub(' ', text)
    return collapsed.strip()
def remove_tags(text):
    """
    Removes html-like tags from given text. Tag names aren't checked,
    <no-valid-tag></no-valid-tag> pair will be removed.
    """
    tag_pattern = re.compile(r"<(\w|\/)[^<\s\"']*?>")
    without_tags = tag_pattern.sub(' ', text)
    return remove_extra_spaces(without_tags)
def render_markdown(text, escape_latex_math=True, md=None, **kwargs):
    """ Mako markdown to HTML filter
        :param text: Markdown source to convert to HTML
        :param escape_latex_math: Whether math expression should
                                  be left untouched by markdown/bleach
        :param md: An alternative markdown processor (can be used
                   to generate e.g. a different format)
        :param kwargs: Extra arguments to pass on to the markdown
                       processor
    """
    if escape_latex_math:
        # stash LaTeX math segments away and replace them with placeholders
        # so the markdown/bleach passes cannot mangle them
        math_segments = []

        def _math_replace(m):
            math_segments.append(m.group(0))
            return LATEX_MATH_PLACEHOLDER

        # NOTE(review): the '$$(^\$)$$' alternative looks suspect (matches a
        # start-anchor inside a group) -- kept as-is from the original.
        text = re.sub(r'\$[^\$]+\$|\$\$(^\$)\$\$', _math_replace, to_unicode(text))

    if md is None:
        result = markdown.markdown(bleach.clean(text, tags=BLEACH_ALLOWED_TAGS), **kwargs)
    else:
        result = md(text, **kwargs)

    if escape_latex_math:
        # restore the math segments, in order, where the placeholders are
        result = re.sub(LATEX_MATH_PLACEHOLDER, lambda _: math_segments.pop(0), result)
    return result
def sanitize_for_platypus(text):
    """Sanitize HTML to be used in platypus"""
    tags = ['b', 'br', 'em', 'font', 'i', 'img', 'strike', 'strong', 'sub', 'sup', 'u', 'span', 'div', 'p']
    # Bug fixed: the reviewed copy was missing the ``attrs = {`` assignment
    # and closing brace, so ``attrs`` was undefined at the bleach.clean call.
    attrs = {
        'font': ['size', 'face', 'color'],
        'img': ['src', 'width', 'height', 'valign']
    }
    res = bleach.clean(text, tags=tags, attributes=attrs, strip=True)
    # Round-trip through lxml so platypus receives well-formed XHTML
    doc = html.fromstring(res)
    return etree.tostring(doc)
# TODO: reference implementation from MaKaC
# but, it's not totally correct according to RFC, see test cases
# However, this regex is pretty good in term of practicality
# but it may be updated to cover all cases
# Bug fixed: the '@' separating the local part from the domain was missing.
VALID_EMAIL_REGEX = re.compile(r"""[-a-zA-Z0-9!#$%&'*+/=?\^_`{|}~]+
                                   (?:.[-a-zA-Z0-9!#$%&'*+/=?^_`{|}~]+)*
                                   @
                                   (?:[a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?.)+
                                   [a-zA-Z0-9](?:[-a-zA-Z0-9]*[a-zA-Z0-9])?""", re.X)
def is_valid_mail(emails_string, multi=True):
    """
    Checks the validity of an email address or a series of email addresses

    - emails_string: a string representing a single email address or several
      email addresses separated by separators
    - multi: flag if multiple email addresses are allowed

    Returns True if emails are valid.

    Bug fixed: the ``if not multi`` branch had no body in the reviewed copy;
    it must reject input containing more than one address.
    """
    # split on any run of whitespace, semicolons or commas
    emails = re.split(r'[\s;,]+', emails_string)
    if not multi and len(emails) > 1:
        return False
    return all(re.match(VALID_EMAIL_REGEX, email) for email in emails if email)
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Sort key that orders embedded numbers numerically ('a2' before 'a10').

    Splits *s* on digit runs; digit chunks compare as ints, the rest
    case-insensitively.  The compiled pattern default is a deliberate cache.
    """
    chunks = re.split(_nsre, s)
    return [int(chunk) if chunk.isdigit() else chunk.lower() for chunk in chunks]
def seems_html(string):
    """Return True if *string* contains something that looks like an HTML tag."""
    match = re.search(r'<[a-z]+?>', string)
    return match is not None
def strip_control_chars(string):
    """Remove ASCII control characters in the range 0x0B-0x1F.

    Tab (0x09) and newline (0x0A) fall outside the range and are kept.
    """
    control_chars = re.compile(r'[\x0B-\x1F]')
    return control_chars.sub('', string)
def html_color_to_rgb(hexcolor):
    """
    convert #RRGGBB to an (R, G, B) tuple

    Each channel is returned as a float in [0, 1].  Both the #RGB shorthand
    and the full #RRGGBB form are accepted.

    :raises ValueError: if the string is not a valid color

    Bug fixed: the second error message contained a stray doubled quote
    ("'#{}''" -> "'#{}'").
    """
    if not hexcolor.startswith('#'):
        raise ValueError("Invalid color string '{}' (should start with '#')".format(hexcolor))

    hexcolor = hexcolor[1:]

    if len(hexcolor) not in {3, 6}:
        raise ValueError("'#{}' is not in #RRGGBB or #RGB format".format(hexcolor))

    if len(hexcolor) == 3:
        # expand #RGB shorthand: each hex digit is doubled (#f0a -> #ff00aa)
        hexcolor = ''.join(c * 2 for c in hexcolor)

    return tuple(float(int(hexcolor[i:i + 2], 16)) / 255 for i in range(0, 6, 2))
def strip_whitespace(s):
    """Removes trailing/leading whitespace if a string was passed.

    This utility is useful in cases where you might get None or
    non-string values such as WTForms filters.

    Bug fixed: the reviewed copy was missing the strip and the final
    ``return``, so it always returned None.
    """
    if isinstance(s, basestring):
        s = s.strip()
    return s
def make_unique_token(is_unique):
    """Create a unique UUID4-based token

    :param is_unique: a callable invoked with the token which should
                      return a boolean indicating if the token is actually
                      unique

    Bug fixed: the reviewed copy never returned the generated token.
    """
    token = unicode(uuid4())
    while not is_unique(token):
        token = unicode(uuid4())
    return token
    def _wrapper(*args, **kwargs):
        # Closure helper of a decorator whose outer ``def`` is not visible in
        # this chunk; ``f`` is the wrapped callable from the enclosing scope.
        rv = f(*args, **kwargs)
        # NOTE(review): one or two lines between the call and the return were
        # dropped from this view (possibly falsy-result handling) -- confirm
        # against the full file before relying on this path.
        # Coerce the result to a utf-8 bytestring (str() for non-unicode).
        return rv.encode('utf-8') if isinstance(rv, unicode) else str(rv)
def is_legacy_id(id_):
    """Checks if an ID is a broken legacy ID.

    These IDs are not compatible with new code since they are not
    numeric or have a leading zero, resulting in different objects
    with the same numeric id.
    """
    if isinstance(id_, (int, long)):
        return False
    # non-numeric, or numeric with a leading zero (str(int(x)) drops it)
    return not id_.isdigit() or str(int(id_)) != id_