3 # PEP-540 (Add a new UTF-8 mode) makes a compelling argument for Python
4 # programs making special effort to work around misconfigured locale
5 # settings. This largely boils down to treating incoming byte sequences,
6 # i.e. command line arguments and environment variables, as UTF-8.
8 # This is specifically relevant when the POSIX (aka C) locale is in effect.
10 # https://www.python.org/dev/peps/pep-0540/
12 # The following functions help achieve this goal by using UTF-8 as a fallback
13 # encoding when the nominal encoding (sys.getfilesystemencoding()) fails.
17 if isinstance(b
, bytes
):
20 except UnicodeDecodeError:
21 return b
.decode('utf-8')
23 return os
.fsencode(b
).decode('utf-8')
29 except UnicodeEncodeError:
30 return s
.encode('utf-8')
def environ_get(key, default=None):
    """Look up an environment variable, re-decoding its value as UTF-8.

    The value held by ``os.environ`` is encoded back to bytes using the
    ``surrogateescape`` error handler and then strictly decoded as UTF-8.

    :param key: name of the environment variable.
    :param default: object returned unchanged when `key` is not set.
    :returns: the re-decoded value, or `default` if `key` is absent.
    :raises UnicodeDecodeError: if the recovered bytes are not valid UTF-8.

    """
    try:
        s = os.environ[key]
    except KeyError:
        # Hand the fallback back untouched: it may be None or some other
        # non-str object that has no encode() method.
        return default
    # surrogateescape maps escaped surrogates in the str back to the raw
    # bytes they stand for, giving those bytes a second chance as UTF-8.
    return s.encode('utf-8', 'surrogateescape').decode('utf-8')
41 def decode_utf8_with_latin1(input, errors
='strict'):
42 """Decode utf-8 bytes with possible latin-1 encoded bytes.
44 There are cases where encoded byte streams may nominally be utf-8 encoded,
45 but contain stray latin-1 (iso8859-1) characters. The input bytes are
46 decoded as utf-8, but with any non-utf-8 byte sequences decoded as latin-1.
48 This is the decode strategy employed by git when decoding utf-8 email
55 s
+= input.decode('utf-8', 'strict')
56 except UnicodeDecodeError as e
:
57 _
, _
, start
, end
, _
= e
.args
58 s
+= input[:start
].decode('utf-8')
59 s
+= input[start
:end
].decode('latin1')