Lib/email/utils.py

   1 # Copyright (C) 2001-2006 Python Software Foundation
   2 # Author: Barry Warsaw
   3 # Contact: email-sig@python.org
   4
   5 """Miscellaneous utilities."""
   6
   7 __all__ = [
   8     'collapse_rfc2231_value',
   9     'decode_params',
  10     'decode_rfc2231',
  11     'encode_rfc2231',
  12     'formataddr',
  13     'formatdate',
  14     'getaddresses',
  15     'make_msgid',
  16     'parseaddr',
  17     'parsedate',
  18     'parsedate_tz',
  19     'unquote',
  20     ]
  21
  22 import os
  23 import re
  24 import time
  25 import base64
  26 import random
  27 import socket
  28 import urllib
  29 import warnings
  30
  31 from email._parseaddr import quote
  32 from email._parseaddr import AddressList as _AddressList
  33 from email._parseaddr import mktime_tz
  34
  35 # We need wormarounds for bugs in these methods in older Pythons (see below)
  36 from email._parseaddr import parsedate as _parsedate
  37 from email._parseaddr import parsedate_tz as _parsedate_tz
  38
  39 from quopri import decodestring as _qdecode
  40
  41 # Intrapackage imports
  42 from email.encoders import _bencode, _qencode
  43
# Shared string constants used when joining and formatting header values.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = u''
CRLF = '\r\n'
# Separator that decode_rfc2231() splits on and encode_rfc2231() writes
# between the charset, language and value fields.
TICK = "'"

# If any of these characters occurs in a real name, formataddr() wraps the
# name in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() prefixes with a backslash in a real name.
escapesre = re.compile(r'[][\\()"]')
  52
  53
  54 \f
  55 # Helpers
  56
  57 def _identity(s):
  58     return s
  59
  60
  61 def _bdecode(s):
  62     # We can't quite use base64.encodestring() since it tacks on a "courtesy
  63     # newline".  Blech!
  64     if not s:
  65         return s
  66     value = base64.decodestring(s)
  67     if not s.endswith('\n') and value.endswith('\n'):
  68         return value[:-1]
  69     return value
  70
  71
  72 \f
  73 def fix_eols(s):
  74     """Replace all line-ending characters with \r\n."""
  75     # Fix newlines with no preceding carriage return
  76     s = re.sub(r'(?<!\r)\n', CRLF, s)
  77     # Fix carriage returns with no following newline
  78     s = re.sub(r'\r(?!\n)', CRLF, s)
  79     return s
  80
  81
  82 \f
  83 def formataddr(pair):
  84     """The inverse of parseaddr(), this takes a 2-tuple of the form
  85     (realname, email_address) and returns the string value suitable
  86     for an RFC 2822 From, To or Cc header.
  87
  88     If the first element of pair is false, then the second element is
  89     returned unmodified.
  90     """
  91     name, address = pair
  92     if name:
  93         quotes = ''
  94         if specialsre.search(name):
  95             quotes = '"'
  96         name = escapesre.sub(r'\\\g<0>', name)
  97         return '%s%s%s <%s>' % (quotes, name, quotes, address)
  98     return address
  99
 100
 101 \f
 102 def getaddresses(fieldvalues):
 103     """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
 104     all = COMMASPACE.join(fieldvalues)
 105     a = _AddressList(all)
 106     return a.addresslist
 107
 108
 109 \f
# Matches text of the form =?charset?encoding?atom?= and exposes the three
# fields as the named groups 'charset', 'encoding' and 'atom'.  The encoding
# must be a single "q" or "b"; the whole pattern is case insensitive.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
 119
 120
 121 \f
 122 def formatdate(timeval=None, localtime=False, usegmt=False):
 123     """Returns a date string as specified by RFC 2822, e.g.:
 124
 125     Fri, 09 Nov 2001 01:08:47 -0000
 126
 127     Optional timeval if given is a floating point time value as accepted by
 128     gmtime() and localtime(), otherwise the current time is used.
 129
 130     Optional localtime is a flag that when True, interprets timeval, and
 131     returns a date relative to the local timezone instead of UTC, properly
 132     taking daylight savings time into account.
 133
 134     Optional argument usegmt means that the timezone is written out as
 135     an ascii string, not numeric one (so "GMT" instead of "+0000"). This
 136     is needed for HTTP, and is only used when localtime==False.
 137     """
 138     # Note: we cannot use strftime() because that honors the locale and RFC
 139     # 2822 requires that day and month names be the English abbreviations.
 140     if timeval is None:
 141         timeval = time.time()
 142     if localtime:
 143         now = time.localtime(timeval)
 144         # Calculate timezone offset, based on whether the local zone has
 145         # daylight savings time, and whether DST is in effect.
 146         if time.daylight and now[-1]:
 147             offset = time.altzone
 148         else:
 149             offset = time.timezone
 150         hours, minutes = divmod(abs(offset), 3600)
 151         # Remember offset is in seconds west of UTC, but the timezone is in
 152         # minutes east of UTC, so the signs differ.
 153         if offset > 0:
 154             sign = '-'
 155         else:
 156             sign = '+'
 157         zone = '%s%02d%02d' % (sign, hours, minutes // 60)
 158     else:
 159         now = time.gmtime(timeval)
 160         # Timezone offset is always -0000
 161         if usegmt:
 162             zone = 'GMT'
 163         else:
 164             zone = '-0000'
 165     return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
 166         ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]],
 167         now[2],
 168         ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
 169          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
 170         now[0], now[3], now[4], now[5],
 171         zone)
 172
 173
 174 \f
 175 def make_msgid(idstring=None):
 176     """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
 177
 178     <20020201195627.33539.96671@nightshade.la.mastaler.com>
 179
 180     Optional idstring if given is a string used to strengthen the
 181     uniqueness of the message id.
 182     """
 183     timeval = time.time()
 184     utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
 185     pid = os.getpid()
 186     randint = random.randrange(100000)
 187     if idstring is None:
 188         idstring = ''
 189     else:
 190         idstring = '.' + idstring
 191     idhost = socket.getfqdn()
 192     msgid = '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, idhost)
 193     return msgid
 194
 195
 196 \f
 197 # These functions are in the standalone mimelib version only because they've
 198 # subsequently been fixed in the latest Python versions.  We use this to worm
 199 # around broken older Pythons.
 200 def parsedate(data):
 201     if not data:
 202         return None
 203     return _parsedate(data)
 204
 205
 206 def parsedate_tz(data):
 207     if not data:
 208         return None
 209     return _parsedate_tz(data)
 210
 211
 212 def parseaddr(addr):
 213     addrs = _AddressList(addr).addresslist
 214     if not addrs:
 215         return '', ''
 216     return addrs[0]
 217
 218
 219 # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
 220 def unquote(str):
 221     """Remove quotes from a string."""
 222     if len(str) > 1:
 223         if str.startswith('"') and str.endswith('"'):
 224             return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 225         if str.startswith('<') and str.endswith('>'):
 226             return str[1:-1]
 227     return str
 228
 229
 230 \f
 231 # RFC2231-related functions - parameter encoding and decoding
 232 def decode_rfc2231(s):
 233     """Decode string according to RFC 2231"""
 234     parts = s.split(TICK, 2)
 235     if len(parts) <= 2:
 236         return None, None, s
 237     return parts
 238
 239
 240 def encode_rfc2231(s, charset=None, language=None):
 241     """Encode string according to RFC 2231.
 242
 243     If neither charset nor language is given, then s is returned as-is.  If
 244     charset is given but not language, the string is encoded using the empty
 245     string for language.
 246     """
 247     import urllib
 248     s = urllib.quote(s, safe='')
 249     if charset is None and language is None:
 250         return s
 251     if language is None:
 252         language = ''
 253     return "%s'%s'%s" % (charset, language, s)
 254
 255
# Matches parameter names written as NAME*, NAME*N or NAME*N* (N being a run
# of digits), capturing the base name as 'name' and the optional number as
# 'num'.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
 257
 258 def decode_params(params):
 259     """Decode parameters list according to RFC 2231.
 260
 261     params is a sequence of 2-tuples containing (param name, string value).
 262     """
 263     # Copy params so we don't mess with the original
 264     params = params[:]
 265     new_params = []
 266     # Map parameter's name to a list of continuations.  The values are a
 267     # 3-tuple of the continuation number, the string value, and a flag
 268     # specifying whether a particular segment is %-encoded.
 269     rfc2231_params = {}
 270     name, value = params.pop(0)
 271     new_params.append((name, value))
 272     while params:
 273         name, value = params.pop(0)
 274         if name.endswith('*'):
 275             encoded = True
 276         else:
 277             encoded = False
 278         value = unquote(value)
 279         mo = rfc2231_continuation.match(name)
 280         if mo:
 281             name, num = mo.group('name', 'num')
 282             if num is not None:
 283                 num = int(num)
 284             rfc2231_params.setdefault(name, []).append((num, value, encoded))
 285         else:
 286             new_params.append((name, '"%s"' % quote(value)))
 287     if rfc2231_params:
 288         for name, continuations in rfc2231_params.items():
 289             value = []
 290             extended = False
 291             # Sort by number
 292             continuations.sort()
 293             # And now append all values in numerical order, converting
 294             # %-encodings for the encoded segments.  If any of the
 295             # continuation names ends in a *, then the entire string, after
 296             # decoding segments and concatenating, must have the charset and
 297             # language specifiers at the beginning of the string.
 298             for num, s, encoded in continuations:
 299                 if encoded:
 300                     s = urllib.unquote(s)
 301                     extended = True
 302                 value.append(s)
 303             value = quote(EMPTYSTRING.join(value))
 304             if extended:
 305                 charset, language, value = decode_rfc2231(value)
 306                 new_params.append((name, (charset, language, '"%s"' % value)))
 307             else:
 308                 new_params.append((name, '"%s"' % value))
 309     return new_params
 310
 311 def collapse_rfc2231_value(value, errors='replace',
 312                            fallback_charset='us-ascii'):
 313     if isinstance(value, tuple):
 314         rawval = unquote(value[2])
 315         charset = value[0] or 'us-ascii'
 316         try:
 317             return unicode(rawval, charset, errors)
 318         except LookupError:
 319             # XXX charset is unknown to Python.
 320             return unicode(rawval, fallback_charset, errors)
 321     else:
 322         return unquote(value)