update pydoc topics
[python/dscho.git] / Lib / email / utils.py
blob465903f88d37b2988be679268384f21b002e3969
1 # Copyright (C) 2001-2007 Python Software Foundation
2 # Author: Barry Warsaw
3 # Contact: email-sig@python.org
5 """Miscellaneous utilities."""
7 __all__ = [
8 'collapse_rfc2231_value',
9 'decode_params',
10 'decode_rfc2231',
11 'encode_rfc2231',
12 'formataddr',
13 'formatdate',
14 'getaddresses',
15 'make_msgid',
16 'parseaddr',
17 'parsedate',
18 'parsedate_tz',
19 'unquote',
22 import os
23 import re
24 import time
25 import base64
26 import random
27 import socket
28 import urllib.parse
29 import warnings
30 from io import StringIO
32 from email._parseaddr import quote
33 from email._parseaddr import AddressList as _AddressList
34 from email._parseaddr import mktime_tz
36 # We need workarounds for bugs in these methods in older Pythons (see below)
37 from email._parseaddr import parsedate as _parsedate
38 from email._parseaddr import parsedate_tz as _parsedate_tz
40 from quopri import decodestring as _qdecode
42 # Intrapackage imports
43 from email.encoders import _bencode, _qencode
# Small string constants shared by the helpers in this module.
COMMASPACE = ", "
EMPTYSTRING = ""
UEMPTYSTRING = ""
CRLF = "\r\n"
# Separator between the charset, language and text of an RFC 2231 value.
TICK = "'"

# Characters whose presence in a display name forces it to be double-quoted.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters in a display name that get a backslash put in front of them.
escapesre = re.compile(r'[][\\()"]')
56 # Helpers
def formataddr(pair):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the result is a string suitable for
    an RFC 2822 From, To or Cc header.

    When the real name is false (empty or None) the address is returned
    unmodified.  Otherwise the name has its special characters
    backslash-escaped, is wrapped in double quotes if it contains any
    RFC 2822 "specials", and is followed by the address in angle brackets.
    """
    name, address = pair
    if not name:
        return address
    # Quote the display name only when it contains a special character.
    quotes = '"' if specialsre.search(name) else ''
    escaped = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, escaped, quotes, address)
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    All the field values are joined with ', ' and parsed as a single
    address list.
    """
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
# Matches a single encoded word of the form =?charset?encoding?atom?=
ecre = re.compile(
    r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''',
    re.VERBOSE | re.IGNORECASE,
)
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Return a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, if given, is a floating point time value as accepted by
    gmtime() and localtime(); when omitted the current time is used.

    localtime is a flag: when true, timeval is interpreted in, and the
    result is expressed relative to, the local timezone instead of UTC,
    taking daylight savings time into account.

    usegmt means the timezone is written as the ascii string "GMT" rather
    than as a numeric offset.  This is needed for HTTP and only has an
    effect when localtime is false.
    """
    # strftime() is deliberately avoided: it honors the locale, whereas
    # RFC 2822 requires the English day and month abbreviations.
    day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    if timeval is None:
        timeval = time.time()
    if localtime:
        now = time.localtime(timeval)
        # Pick the DST offset only if the local zone defines DST and the
        # DST flag is set for this particular time.
        in_dst = time.daylight and now.tm_isdst
        offset = time.altzone if in_dst else time.timezone
        hours, leftover = divmod(abs(offset), 3600)
        # offset counts seconds *west* of UTC while the header zone is
        # expressed *east* of UTC, hence the inverted sign.
        sign = '-' if offset > 0 else '+'
        zone = '%s%02d%02d' % (sign, hours, leftover // 60)
    else:
        now = time.gmtime(timeval)
        # In UTC the numeric zone is always written as -0000.
        zone = 'GMT' if usegmt else '-0000'
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_names[now.tm_wday],
        now.tm_mday,
        month_names[now.tm_mon - 1],
        now.tm_year, now.tm_hour, now.tm_min, now.tm_sec,
        zone)
def make_msgid(idstring=None, domain=None):
    """Return a string suitable for an RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    Optional idstring, if given, is a string appended (after a dot) to the
    left-hand side to strengthen the uniqueness of the message id.

    Optional domain, if given, supplies the portion of the message id after
    the '@'.  It defaults to the value of socket.getfqdn(); passing it
    explicitly avoids that host name lookup and lets the caller choose a
    stable, meaningful host part.
    """
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    pid = os.getpid()
    randint = random.randrange(100000)
    idstring = '' if idstring is None else '.' + idstring
    if domain is None:
        # Only hit the resolver when the caller did not supply a domain.
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain)
172 # These functions are in the standalone mimelib version only because they've
173 # subsequently been fixed in the latest Python versions. We use this to work
174 # around broken older Pythons.
def parsedate(data):
    """Parse a date string via email._parseaddr.parsedate().

    A false value (None or an empty string) is answered with None without
    calling the underlying parser.
    """
    return _parsedate(data) if data else None
def parsedate_tz(data):
    """Parse a date string via email._parseaddr.parsedate_tz().

    A false value (None or an empty string) is answered with None without
    calling the underlying parser.
    """
    return _parsedate_tz(data) if data else None
def parseaddr(addr):
    """Parse addr and return the first address found in it.

    The result is the first entry of the parsed address list, or the pair
    ('', '') when the list is empty.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
194 # rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes loses them and has its escaped
    backslashes and escaped double quotes unescaped; a string wrapped in
    angle brackets just loses the brackets.  Anything else, including
    strings shorter than two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    first, last = str[0], str[-1]
    if first == '"' and last == '"':
        inner = str[1:-1]
        # Collapse doubled backslashes first, then unescape the quotes.
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if first == '<' and last == '>':
        return str[1:-1]
    return str
206 # RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on the first two single quotes.  If that yields three
    pieces, the list of pieces (charset, language, text) is returned;
    otherwise the tuple (None, None, s) is returned.
    """
    pieces = s.split(TICK, 2)
    return pieces if len(pieces) > 2 else (None, None, s)
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-quoted (with no safe characters), using charset,
    or 'ascii' when no charset is given, to encode it.  If neither charset
    nor language is given, the quoted string is returned on its own.
    Otherwise the result is charset'language'quoted-string, with the empty
    string standing in for a missing language.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
# Matches an RFC 2231 parameter name: NAME*, NAME*N or NAME*N*.
rfc2231_continuation = re.compile(
    r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII,
)
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.  The first entry is passed through unchanged; an
    empty sequence raises IndexError.

    Returns a new list of 2-tuples.  Ordinary parameters come back as
    (name, '"value"').  RFC 2231 parameters are gathered by name, their
    segments joined in sort order, and come back either as (name, '"value"')
    or, when any segment was %-encoded, as
    (name, (charset, language, '"value"')).
    """
    # The first entry is kept as-is (re-packed as a tuple).
    first_name, first_value = params[0]
    new_params = [(first_name, first_value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterate over a slice instead of repeatedly popping from the front:
    # it leaves the caller's sequence alone, is linear rather than
    # quadratic, and works for tuples as well as lists.
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number
        continuations.sort()
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse an RFC 2231 parameter value into a plain string.

    If value is not a 3-tuple it is simply passed through unquote().
    Otherwise it is taken to be (charset, language, text): the characters
    of text are reinterpreted as raw bytes and decoded with charset, using
    errors as the decoding error handler.

    When charset is None (the value carried no parsable charset),
    fallback_charset is used instead.  When the charset names an unknown
    codec, the text is returned via unquote() without decoding.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    charset, language, text = value
    if charset is None:
        # No charset could be extracted from the value; str() would raise
        # TypeError on a None encoding, so decode with the fallback.
        charset = fallback_charset
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)