stg import now extracts Message-ID header
[stgit.git] / stgit / compat.py
blob4fe50f67440544615b2fb977dc785540b8db90c3
1 import os
3 # PEP-540 (Add a new UTF-8 mode) makes a compelling argument for Python
4 # programs making special effort to work around misconfigured locale
5 # settings. This largely boils down to treating incoming byte sequences,
6 # i.e. command line arguments and environment variables, as UTF-8.
8 # This is specifically relevant when the POSIX (aka C) locale is in effect.
10 # https://www.python.org/dev/peps/pep-0540/
12 # The following functions help achieve this goal by using UTF-8 as a fallback
13 # encoding when the nominal encoding (sys.getfilesystemencoding()) fails.
def fsdecode_utf8(b):
    """Return *b* as text, using UTF-8 when the filesystem codec fails.

    A ``bytes`` argument is decoded with ``os.fsdecode()``; if that raises
    ``UnicodeDecodeError`` the bytes are decoded as UTF-8 instead.

    Any other argument is first turned back into bytes with
    ``os.fsencode()`` and those bytes are then decoded as UTF-8.

    """
    if not isinstance(b, bytes):
        # Recover the underlying byte sequence, then interpret it as UTF-8.
        raw = os.fsencode(b)
        return raw.decode('utf-8')

    try:
        text = os.fsdecode(b)
    except UnicodeDecodeError:
        text = b.decode('utf-8')
    return text
def fsencode_utf8(s):
    """Return *s* as bytes, using UTF-8 when the filesystem codec fails.

    The value is encoded with ``os.fsencode()``; if that raises
    ``UnicodeEncodeError`` the string is encoded as UTF-8 instead.

    """
    try:
        encoded = os.fsencode(s)
    except UnicodeEncodeError:
        # The nominal filesystem encoding cannot represent the string.
        encoded = s.encode('utf-8')
    return encoded
def environ_get(key, default=None):
    """Look up *key* in ``os.environ`` and return its value as UTF-8 text.

    When the lookup yields the *default* object itself, *default* is returned
    untouched. Otherwise the value is encoded to UTF-8 with the
    ``surrogateescape`` error handler (turning escaped surrogates back into
    their original bytes) and the result is strictly decoded as UTF-8.

    """
    value = os.environ.get(key, default)
    if value is not default:
        raw = value.encode('utf-8', 'surrogateescape')
        return raw.decode('utf-8')
    return default
def decode_utf8_with_latin1(input, errors='strict'):
    """Decode bytes as utf-8, falling back to latin-1 for invalid sequences.

    Some byte streams are nominally utf-8 but contain stray latin-1
    (iso8859-1) characters. Valid utf-8 runs are decoded as utf-8, while each
    byte range the utf-8 codec rejects is decoded as latin-1 instead.

    This is the decode strategy employed by git when decoding utf-8 email
    bodies.

    The *errors* argument is accepted but is not consulted by this
    implementation.

    """
    pieces = []
    remaining = input
    while True:
        try:
            tail = remaining.decode('utf-8', 'strict')
        except UnicodeDecodeError as exc:
            bad_from, bad_to = exc.start, exc.end
            # Everything before the offending range is known-good utf-8.
            pieces.append(remaining[:bad_from].decode('utf-8'))
            # The rejected bytes are interpreted as latin-1 characters.
            pieces.append(remaining[bad_from:bad_to].decode('latin1'))
            remaining = remaining[bad_to:]
            continue
        pieces.append(tail)
        return ''.join(pieces)