stgit/compat.py

   1 import os
   2
   3 # PEP-540 (Add a new UTF-8 mode) makes a compelling argument for Python
   4 # programs making special effort to work around misconfigured locale
   5 # settings. This largely boils down to treating incoming byte sequences,
   6 # i.e. command line arguments and environment variables, as UTF-8.
   7 #
   8 # This is specifically relevant when the POSIX (aka C) locale is in effect.
   9 #
  10 # https://www.python.org/dev/peps/pep-0540/
  11 #
  12 # The following functions help achieve this goal by using UTF-8 as a fallback
  13 # encoding when the nominal encoding (sys.getfilesystemencoding()) fails.
  14
  15
  16 def fsdecode_utf8(b):
  17     if isinstance(b, bytes):
  18         try:
  19             return os.fsdecode(b)
  20         except UnicodeDecodeError:
  21             return b.decode('utf-8')
  22     else:
  23         return os.fsencode(b).decode('utf-8')
  24
  25
  26 def fsencode_utf8(s):
  27     try:
  28         return os.fsencode(s)
  29     except UnicodeEncodeError:
  30         return s.encode('utf-8')
  31
  32
  33 def environ_get(key, default=None):
  34     s = os.environ.get(key, default)
  35     if s is default:
  36         return default
  37     else:
  38         return s.encode('utf-8', 'surrogateescape').decode('utf-8')
  39
  40
  41 def decode_utf8_with_latin1(input, errors='strict'):
  42     """Decode utf-8 bytes with possible latin-1 encoded bytes.
  43
  44     There are cases where encoded byte streams may nominally be utf-8 encoded,
  45     but contain stray latin-1 (iso8859-1) characters. The input bytes are
  46     decoded as utf-8, but with any non-utf-8 byte sequences decoded as latin-1.
  47
  48     This is the decode strategy employed by git when decoding utf-8 email
  49     bodies.
  50
  51     """
  52     s = ''
  53     while True:
  54         try:
  55             s += input.decode('utf-8', 'strict')
  56         except UnicodeDecodeError as e:
  57             _, _, start, end, _ = e.args
  58             s += input[:start].decode('utf-8')
  59             s += input[start:end].decode('latin1')
  60             input = input[end:]
  61         else:
  62             break
  63     return s