Lib/email/utils.py

   1 # Copyright (C) 2001-2007 Python Software Foundation
   2 # Author: Barry Warsaw
   3 # Contact: email-sig@python.org
   4
   5 """Miscellaneous utilities."""
   6
# Names exported by a star-import of this module.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
    ]
  21
  22 import os
  23 import re
  24 import time
  25 import base64
  26 import random
  27 import socket
  28 import urllib.parse
  29 import warnings
  30 from io import StringIO
  31
  32 from email._parseaddr import quote
  33 from email._parseaddr import AddressList as _AddressList
  34 from email._parseaddr import mktime_tz
  35
   36 # We need workarounds for bugs in these methods in older Pythons (see below)
  37 from email._parseaddr import parsedate as _parsedate
  38 from email._parseaddr import parsedate_tz as _parsedate_tz
  39
  40 from quopri import decodestring as _qdecode
  41
  42 # Intrapackage imports
  43 from email.encoders import _bencode, _qencode
  44
# Separator used by getaddresses() to join several field values into one
# string before parsing.
COMMASPACE = ', '
# Joiner used by decode_params() to concatenate continuation segments.
EMPTYSTRING = ''
# NOTE(review): same value as EMPTYSTRING and not referenced in the visible
# code -- presumably kept for backward compatibility; confirm before removing.
UEMPTYSTRING = ''
CRLF = '\r\n'
# Delimiter on which decode_rfc2231() splits a value into its charset,
# language and text fields.
TICK = "'"

# A real name containing any of these characters is wrapped in double quotes
# by formataddr().
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Each of these characters is prefixed with a backslash by formataddr().
escapesre = re.compile(r'[][\\()"]')
  53
  54
  55
  56 # Helpers
  57
  58 def formataddr(pair):
  59     """The inverse of parseaddr(), this takes a 2-tuple of the form
  60     (realname, email_address) and returns the string value suitable
  61     for an RFC 2822 From, To or Cc header.
  62
  63     If the first element of pair is false, then the second element is
  64     returned unmodified.
  65     """
  66     name, address = pair
  67     if name:
  68         quotes = ''
  69         if specialsre.search(name):
  70             quotes = '"'
  71         name = escapesre.sub(r'\\\g<0>', name)
  72         return '%s%s%s <%s>' % (quotes, name, quotes, address)
  73     return address
  74
  75
  76
  77 def getaddresses(fieldvalues):
  78     """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
  79     all = COMMASPACE.join(fieldvalues)
  80     a = _AddressList(all)
  81     return a.addresslist
  82
  83
  84
# Matches text of the form =?charset?encoding?atom?= (the layout of an
# RFC 2047 encoded word) and exposes the three fields as the named groups
# 'charset', 'encoding' and 'atom'.  Matching is case insensitive.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
  94
  95
  96
  97 def formatdate(timeval=None, localtime=False, usegmt=False):
  98     """Returns a date string as specified by RFC 2822, e.g.:
  99
 100     Fri, 09 Nov 2001 01:08:47 -0000
 101
 102     Optional timeval if given is a floating point time value as accepted by
 103     gmtime() and localtime(), otherwise the current time is used.
 104
 105     Optional localtime is a flag that when True, interprets timeval, and
 106     returns a date relative to the local timezone instead of UTC, properly
 107     taking daylight savings time into account.
 108
 109     Optional argument usegmt means that the timezone is written out as
 110     an ascii string, not numeric one (so "GMT" instead of "+0000"). This
 111     is needed for HTTP, and is only used when localtime==False.
 112     """
 113     # Note: we cannot use strftime() because that honors the locale and RFC
 114     # 2822 requires that day and month names be the English abbreviations.
 115     if timeval is None:
 116         timeval = time.time()
 117     if localtime:
 118         now = time.localtime(timeval)
 119         # Calculate timezone offset, based on whether the local zone has
 120         # daylight savings time, and whether DST is in effect.
 121         if time.daylight and now[-1]:
 122             offset = time.altzone
 123         else:
 124             offset = time.timezone
 125         hours, minutes = divmod(abs(offset), 3600)
 126         # Remember offset is in seconds west of UTC, but the timezone is in
 127         # minutes east of UTC, so the signs differ.
 128         if offset > 0:
 129             sign = '-'
 130         else:
 131             sign = '+'
 132         zone = '%s%02d%02d' % (sign, hours, minutes // 60)
 133     else:
 134         now = time.gmtime(timeval)
 135         # Timezone offset is always -0000
 136         if usegmt:
 137             zone = 'GMT'
 138         else:
 139             zone = '-0000'
 140     return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
 141         ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]],
 142         now[2],
 143         ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
 144          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
 145         now[0], now[3], now[4], now[5],
 146         zone)
 147
 148
 149
 150 def make_msgid(idstring=None):
 151     """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
 152
 153     <20020201195627.33539.96671@nightshade.la.mastaler.com>
 154
 155     Optional idstring if given is a string used to strengthen the
 156     uniqueness of the message id.
 157     """
 158     timeval = time.time()
 159     utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
 160     pid = os.getpid()
 161     randint = random.randrange(100000)
 162     if idstring is None:
 163         idstring = ''
 164     else:
 165         idstring = '.' + idstring
 166     idhost = socket.getfqdn()
 167     msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
 168     return msgid
 169
 170
 171
 172 # These functions are in the standalone mimelib version only because they've
 173 # subsequently been fixed in the latest Python versions.  We use this to worm
 174 # around broken older Pythons.
 175 def parsedate(data):
 176     if not data:
 177         return None
 178     return _parsedate(data)
 179
 180
 181 def parsedate_tz(data):
 182     if not data:
 183         return None
 184     return _parsedate_tz(data)
 185
 186
 187 def parseaddr(addr):
 188     addrs = _AddressList(addr).addresslist
 189     if not addrs:
 190         return '', ''
 191     return addrs[0]
 192
 193
 194 # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
 195 def unquote(str):
 196     """Remove quotes from a string."""
 197     if len(str) > 1:
 198         if str.startswith('"') and str.endswith('"'):
 199             return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 200         if str.startswith('<') and str.endswith('>'):
 201             return str[1:-1]
 202     return str
 203
 204
 205
 206 # RFC2231-related functions - parameter encoding and decoding
 207 def decode_rfc2231(s):
 208     """Decode string according to RFC 2231"""
 209     parts = s.split(TICK, 2)
 210     if len(parts) <= 2:
 211         return None, None, s
 212     return parts
 213
 214
 215 def encode_rfc2231(s, charset=None, language=None):
 216     """Encode string according to RFC 2231.
 217
 218     If neither charset nor language is given, then s is returned as-is.  If
 219     charset is given but not language, the string is encoded using the empty
 220     string for language.
 221     """
 222     s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
 223     if charset is None and language is None:
 224         return s
 225     if language is None:
 226         language = ''
 227     return "%s'%s'%s" % (charset, language, s)
 228
 229
# Matches parameter names that use the RFC 2231 '*' syntax: 'name*',
# 'name*N' and 'name*N*', where N is a run of digits.  Group 'name' is the
# part before the first '*'; group 'num' is N, or None when no digits are
# present.  re.ASCII restricts \w to ASCII characters.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
 232
 233 def decode_params(params):
 234     """Decode parameters list according to RFC 2231.
 235
 236     params is a sequence of 2-tuples containing (param name, string value).
 237     """
 238     # Copy params so we don't mess with the original
 239     params = params[:]
 240     new_params = []
 241     # Map parameter's name to a list of continuations.  The values are a
 242     # 3-tuple of the continuation number, the string value, and a flag
 243     # specifying whether a particular segment is %-encoded.
 244     rfc2231_params = {}
 245     name, value = params.pop(0)
 246     new_params.append((name, value))
 247     while params:
 248         name, value = params.pop(0)
 249         if name.endswith('*'):
 250             encoded = True
 251         else:
 252             encoded = False
 253         value = unquote(value)
 254         mo = rfc2231_continuation.match(name)
 255         if mo:
 256             name, num = mo.group('name', 'num')
 257             if num is not None:
 258                 num = int(num)
 259             rfc2231_params.setdefault(name, []).append((num, value, encoded))
 260         else:
 261             new_params.append((name, '"%s"' % quote(value)))
 262     if rfc2231_params:
 263         for name, continuations in rfc2231_params.items():
 264             value = []
 265             extended = False
 266             # Sort by number
 267             continuations.sort()
 268             # And now append all values in numerical order, converting
 269             # %-encodings for the encoded segments.  If any of the
 270             # continuation names ends in a *, then the entire string, after
 271             # decoding segments and concatenating, must have the charset and
 272             # language specifiers at the beginning of the string.
 273             for num, s, encoded in continuations:
 274                 if encoded:
 275                     # Decode as "latin-1", so the characters in s directly
 276                     # represent the percent-encoded octet values.
 277                     # collapse_rfc2231_value treats this as an octet sequence.
 278                     s = urllib.parse.unquote(s, encoding="latin-1")
 279                     extended = True
 280                 value.append(s)
 281             value = quote(EMPTYSTRING.join(value))
 282             if extended:
 283                 charset, language, value = decode_rfc2231(value)
 284                 new_params.append((name, (charset, language, '"%s"' % value)))
 285             else:
 286                 new_params.append((name, '"%s"' % value))
 287     return new_params
 288
 289 def collapse_rfc2231_value(value, errors='replace',
 290                            fallback_charset='us-ascii'):
 291     if not isinstance(value, tuple) or len(value) != 3:
 292         return unquote(value)
 293     # While value comes to us as a unicode string, we need it to be a bytes
 294     # object.  We do not want bytes() normal utf-8 decoder, we want a straight
 295     # interpretation of the string as character bytes.
 296     charset, language, text = value
 297     rawbytes = bytes(text, 'raw-unicode-escape')
 298     try:
 299         return str(rawbytes, charset, errors)
 300     except LookupError:
 301         # charset is not a known codec.
 302         return unquote(text)