Lib/email/utils.py

   1 # Copyright (C) 2001-2010 Python Software Foundation
   2 # Author: Barry Warsaw
   3 # Contact: email-sig@python.org
   4
   5 """Miscellaneous utilities."""
   6
# Public names of this module: the ones a star-import of it will bind.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
    ]
  22
  23 import os
  24 import re
  25 import time
  26 import base64
  27 import random
  28 import socket
  29 import urllib
  30 import warnings
  31
  32 from email._parseaddr import quote
  33 from email._parseaddr import AddressList as _AddressList
  34 from email._parseaddr import mktime_tz
  35
   36 # We need workarounds for bugs in these methods in older Pythons (see below)
  37 from email._parseaddr import parsedate as _parsedate
  38 from email._parseaddr import parsedate_tz as _parsedate_tz
  39
  40 from quopri import decodestring as _qdecode
  41
  42 # Intrapackage imports
  43 from email.encoders import _bencode, _qencode
  44
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = u''
CRLF = '\r\n'
# Field separator that decode_rfc2231() splits on.
TICK = "'"

# Any character matched here makes formataddr() wrap the real name in
# double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() prefixes with a backslash inside a real name.
escapesre = re.compile(r'[][\\()"]')
  53
  54
  55 \f
  56 # Helpers
  57
  58 def _identity(s):
  59     return s
  60
  61
  62 def _bdecode(s):
  63     # We can't quite use base64.encodestring() since it tacks on a "courtesy
  64     # newline".  Blech!
  65     if not s:
  66         return s
  67     value = base64.decodestring(s)
  68     if not s.endswith('\n') and value.endswith('\n'):
  69         return value[:-1]
  70     return value
  71
  72
  73 \f
  74 def fix_eols(s):
  75     """Replace all line-ending characters with \r\n."""
  76     # Fix newlines with no preceding carriage return
  77     s = re.sub(r'(?<!\r)\n', CRLF, s)
  78     # Fix carriage returns with no following newline
  79     s = re.sub(r'\r(?!\n)', CRLF, s)
  80     return s
  81
  82
  83 \f
  84 def formataddr(pair):
  85     """The inverse of parseaddr(), this takes a 2-tuple of the form
  86     (realname, email_address) and returns the string value suitable
  87     for an RFC 2822 From, To or Cc header.
  88
  89     If the first element of pair is false, then the second element is
  90     returned unmodified.
  91     """
  92     name, address = pair
  93     if name:
  94         quotes = ''
  95         if specialsre.search(name):
  96             quotes = '"'
  97         name = escapesre.sub(r'\\\g<0>', name)
  98         return '%s%s%s <%s>' % (quotes, name, quotes, address)
  99     return address
 100
 101
 102 \f
 103 def getaddresses(fieldvalues):
 104     """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
 105     all = COMMASPACE.join(fieldvalues)
 106     a = _AddressList(all)
 107     return a.addresslist
 108
 109
 110 \f
# Matches text shaped like =?charset?encoding?atom?= and exposes the three
# fields as the named groups 'charset', 'encoding' and 'atom'.  The pattern
# is compiled with IGNORECASE, so the encoding letter may be upper or lower
# case.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
 120
 121
 122 \f
 123 def formatdate(timeval=None, localtime=False, usegmt=False):
 124     """Returns a date string as specified by RFC 2822, e.g.:
 125
 126     Fri, 09 Nov 2001 01:08:47 -0000
 127
 128     Optional timeval if given is a floating point time value as accepted by
 129     gmtime() and localtime(), otherwise the current time is used.
 130
 131     Optional localtime is a flag that when True, interprets timeval, and
 132     returns a date relative to the local timezone instead of UTC, properly
 133     taking daylight savings time into account.
 134
 135     Optional argument usegmt means that the timezone is written out as
 136     an ascii string, not numeric one (so "GMT" instead of "+0000"). This
 137     is needed for HTTP, and is only used when localtime==False.
 138     """
 139     # Note: we cannot use strftime() because that honors the locale and RFC
 140     # 2822 requires that day and month names be the English abbreviations.
 141     if timeval is None:
 142         timeval = time.time()
 143     if localtime:
 144         now = time.localtime(timeval)
 145         # Calculate timezone offset, based on whether the local zone has
 146         # daylight savings time, and whether DST is in effect.
 147         if time.daylight and now[-1]:
 148             offset = time.altzone
 149         else:
 150             offset = time.timezone
 151         hours, minutes = divmod(abs(offset), 3600)
 152         # Remember offset is in seconds west of UTC, but the timezone is in
 153         # minutes east of UTC, so the signs differ.
 154         if offset > 0:
 155             sign = '-'
 156         else:
 157             sign = '+'
 158         zone = '%s%02d%02d' % (sign, hours, minutes // 60)
 159     else:
 160         now = time.gmtime(timeval)
 161         # Timezone offset is always -0000
 162         if usegmt:
 163             zone = 'GMT'
 164         else:
 165             zone = '-0000'
 166     return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
 167         ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]],
 168         now[2],
 169         ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
 170          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
 171         now[0], now[3], now[4], now[5],
 172         zone)
 173
 174
 175 \f
 176 def make_msgid(idstring=None):
 177     """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
 178
 179     <20020201195627.33539.96671@nightshade.la.mastaler.com>
 180
 181     Optional idstring if given is a string used to strengthen the
 182     uniqueness of the message id.
 183     """
 184     timeval = time.time()
 185     utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
 186     pid = os.getpid()
 187     randint = random.randrange(100000)
 188     if idstring is None:
 189         idstring = ''
 190     else:
 191         idstring = '.' + idstring
 192     idhost = socket.getfqdn()
 193     msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
 194     return msgid
 195
 196
 197 \f
 198 # These functions are in the standalone mimelib version only because they've
  199 # subsequently been fixed in the latest Python versions.  We use this to work
  200 # around broken older Pythons.
 201 def parsedate(data):
 202     if not data:
 203         return None
 204     return _parsedate(data)
 205
 206
 207 def parsedate_tz(data):
 208     if not data:
 209         return None
 210     return _parsedate_tz(data)
 211
 212
 213 def parseaddr(addr):
 214     addrs = _AddressList(addr).addresslist
 215     if not addrs:
 216         return '', ''
 217     return addrs[0]
 218
 219
 220 # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
 221 def unquote(str):
 222     """Remove quotes from a string."""
 223     if len(str) > 1:
 224         if str.startswith('"') and str.endswith('"'):
 225             return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 226         if str.startswith('<') and str.endswith('>'):
 227             return str[1:-1]
 228     return str
 229
 230
 231 \f
 232 # RFC2231-related functions - parameter encoding and decoding
 233 def decode_rfc2231(s):
 234     """Decode string according to RFC 2231"""
 235     parts = s.split(TICK, 2)
 236     if len(parts) <= 2:
 237         return None, None, s
 238     return parts
 239
 240
 241 def encode_rfc2231(s, charset=None, language=None):
 242     """Encode string according to RFC 2231.
 243
 244     If neither charset nor language is given, then s is returned as-is.  If
 245     charset is given but not language, the string is encoded using the empty
 246     string for language.
 247     """
 248     import urllib
 249     s = urllib.quote(s, safe='')
 250     if charset is None and language is None:
 251         return s
 252     if language is None:
 253         language = ''
 254     return "%s'%s'%s" % (charset, language, s)
 255
 256
# Matches a parameter name of the form NAME*, NAME*N or NAME*N* (N being a
# run of digits).  Group 'name' is the base parameter name and group 'num'
# is the section number, or None when no digits are present.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
 258
 259 def decode_params(params):
 260     """Decode parameters list according to RFC 2231.
 261
 262     params is a sequence of 2-tuples containing (param name, string value).
 263     """
 264     # Copy params so we don't mess with the original
 265     params = params[:]
 266     new_params = []
 267     # Map parameter's name to a list of continuations.  The values are a
 268     # 3-tuple of the continuation number, the string value, and a flag
 269     # specifying whether a particular segment is %-encoded.
 270     rfc2231_params = {}
 271     name, value = params.pop(0)
 272     new_params.append((name, value))
 273     while params:
 274         name, value = params.pop(0)
 275         if name.endswith('*'):
 276             encoded = True
 277         else:
 278             encoded = False
 279         value = unquote(value)
 280         mo = rfc2231_continuation.match(name)
 281         if mo:
 282             name, num = mo.group('name', 'num')
 283             if num is not None:
 284                 num = int(num)
 285             rfc2231_params.setdefault(name, []).append((num, value, encoded))
 286         else:
 287             new_params.append((name, '"%s"' % quote(value)))
 288     if rfc2231_params:
 289         for name, continuations in rfc2231_params.items():
 290             value = []
 291             extended = False
 292             # Sort by number
 293             continuations.sort()
 294             # And now append all values in numerical order, converting
 295             # %-encodings for the encoded segments.  If any of the
 296             # continuation names ends in a *, then the entire string, after
 297             # decoding segments and concatenating, must have the charset and
 298             # language specifiers at the beginning of the string.
 299             for num, s, encoded in continuations:
 300                 if encoded:
 301                     s = urllib.unquote(s)
 302                     extended = True
 303                 value.append(s)
 304             value = quote(EMPTYSTRING.join(value))
 305             if extended:
 306                 charset, language, value = decode_rfc2231(value)
 307                 new_params.append((name, (charset, language, '"%s"' % value)))
 308             else:
 309                 new_params.append((name, '"%s"' % value))
 310     return new_params
 311
 312 def collapse_rfc2231_value(value, errors='replace',
 313                            fallback_charset='us-ascii'):
 314     if isinstance(value, tuple):
 315         rawval = unquote(value[2])
 316         charset = value[0] or 'us-ascii'
 317         try:
 318             return unicode(rawval, charset, errors)
 319         except LookupError:
 320             # XXX charset is unknown to Python.
 321             return unicode(rawval, fallback_charset, errors)
 322     else:
 323         return unquote(value)