Lib/email/quopriMIME.py

   1 # Copyright (C) 2001-2004 Python Software Foundation
   2 # Author: Ben Gertzfield
   3 # Contact: email-sig@python.org
   4
   5 """Quoted-printable content transfer encoding per RFCs 2045-2047.
   6
   7 This module handles the content transfer encoding method defined in RFC 2045
   8 to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
   9 safely encode text that is in a character set similar to the 7-bit US ASCII
  10 character set, but that includes some 8-bit characters that are normally not
  11 allowed in email bodies or headers.
  12
  13 Quoted-printable is very space-inefficient for encoding binary files; use the
  14 email.base64MIME module for that instead.
  15
  16 This module provides an interface to encode and decode both headers and bodies
  17 with quoted-printable encoding.
  18
   19 RFC 2047 defines a method for including character set information in an
  20 `encoded-word' in a header.  This method is commonly used for 8-bit real names
  21 in To:/From:/Cc: etc. fields, as well as Subject: lines.
  22
  23 This module does not do the line wrapping or end-of-line character
  24 conversion necessary for proper internationalized headers; it only
  25 does dumb encoding and decoding.  To deal with the various line
  26 wrapping issues, use the email.Header module.
  27 """
  28
  29 import re
  30 from string import hexdigits
  31 from email.Utils import fix_eols
  32
# Line terminators: CRLF is the canonical email line separator, NL is the
# bare newline used as the default `eol' argument throughout this module.
CRLF = '\r\n'
NL = '\n'

# See also Charset.py
# Number of characters of encoded-word chrome that header_encode() puts around
# each chunk in addition to the charset name: '=?' + '?q?' + '?=' is 7.
MISC_LEN = 7

# Matches any single character outside the header-safe set (ASCII letters,
# digits, and the characters "-!*+/ "); such characters get escaped.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
# Matches any single character that body quopri escapes: everything except
# space, tab, and the ranges '!'..'<' and '>'..'~' (so '=' itself matches).
bqre = re.compile(r'[^ !-<>-~\t]')
  41
  42
  43 \f
  44 # Helpers
  45 def header_quopri_check(c):
  46     """Return True if the character should be escaped with header quopri."""
  47     return bool(hqre.match(c))
  48
  49
  50 def body_quopri_check(c):
  51     """Return True if the character should be escaped with body quopri."""
  52     return bool(bqre.match(c))
  53
  54
  55 def header_quopri_len(s):
  56     """Return the length of str when it is encoded with header quopri."""
  57     count = 0
  58     for c in s:
  59         if hqre.match(c):
  60             count += 3
  61         else:
  62             count += 1
  63     return count
  64
  65
  66 def body_quopri_len(str):
  67     """Return the length of str when it is encoded with body quopri."""
  68     count = 0
  69     for c in str:
  70         if bqre.match(c):
  71             count += 3
  72         else:
  73             count += 1
  74     return count
  75
  76
  77 def _max_append(L, s, maxlen, extra=''):
  78     if not L:
  79         L.append(s.lstrip())
  80     elif len(L[-1]) + len(s) <= maxlen:
  81         L[-1] += extra + s
  82     else:
  83         L.append(s.lstrip())
  84
  85
  86 def unquote(s):
  87     """Turn a string in the form =AB to the ASCII character with value 0xab"""
  88     return chr(int(s[1:3], 16))
  89
  90
  91 def quote(c):
  92     return "=%02X" % ord(c)
  93
  94
  95 \f
  96 def header_encode(header, charset="iso-8859-1", keep_eols=False,
  97                   maxlinelen=76, eol=NL):
  98     """Encode a single header line with quoted-printable (like) encoding.
  99
 100     Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
 101     used specifically for email header fields to allow charsets with mostly 7
 102     bit characters (and some 8 bit) to remain more or less readable in non-RFC
 103     2045 aware mail clients.
 104
 105     charset names the character set to use to encode the header.  It defaults
 106     to iso-8859-1.
 107
 108     The resulting string will be in the form:
 109
 110     "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?\\n
 111       =?charset?q?Silly_=C8nglish_Kn=EEghts?="
 112
 113     with each line wrapped safely at, at most, maxlinelen characters (defaults
 114     to 76 characters).  If maxlinelen is None, the entire string is encoded in
 115     one chunk with no splitting.
 116
 117     End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
 118     to the canonical email line separator \\r\\n unless the keep_eols
 119     parameter is True (the default is False).
 120
 121     Each line of the header will be terminated in the value of eol, which
 122     defaults to "\\n".  Set this to "\\r\\n" if you are using the result of
 123     this function directly in email.
 124     """
 125     # Return empty headers unchanged
 126     if not header:
 127         return header
 128
 129     if not keep_eols:
 130         header = fix_eols(header)
 131
 132     # Quopri encode each line, in encoded chunks no greater than maxlinelen in
 133     # length, after the RFC chrome is added in.
 134     quoted = []
 135     if maxlinelen is None:
 136         # An obnoxiously large number that's good enough
 137         max_encoded = 100000
 138     else:
 139         max_encoded = maxlinelen - len(charset) - MISC_LEN - 1
 140
 141     for c in header:
 142         # Space may be represented as _ instead of =20 for readability
 143         if c == ' ':
 144             _max_append(quoted, '_', max_encoded)
 145         # These characters can be included verbatim
 146         elif not hqre.match(c):
 147             _max_append(quoted, c, max_encoded)
 148         # Otherwise, replace with hex value like =E2
 149         else:
 150             _max_append(quoted, "=%02X" % ord(c), max_encoded)
 151
 152     # Now add the RFC chrome to each encoded chunk and glue the chunks
 153     # together.  BAW: should we be able to specify the leading whitespace in
 154     # the joiner?
 155     joiner = eol + ' '
 156     return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])
 157
 158
 159 \f
def encode(body, binary=False, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is False (the default), end-of-line characters will be converted
    to the canonical email end-of-line sequence \\r\\n.  Otherwise they will
    be left verbatim.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    Whitespace at the very end of the body is escaped (via quote()); trailing
    whitespace on any other line is kept literally and followed by a soft
    line break.  An empty body is returned unchanged.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # BAW: We're accumulating the body text by string concatenation.  That
    # can't be very efficient, but I don't have time now to rewrite it.  It
    # just feels like this algorithm could be more efficient.
    encoded_body = ''
    # Index of the line currently being processed within `lines'.
    lineno = -1
    # Preserve line endings here so we can check later whether an eol needs
    # to be added to the output.
    lines = body.splitlines(1)
    for line in lines:
        # But strip off line-endings for processing this line.
        if line.endswith(CRLF):
            line = line[:-2]
        elif line[-1] in CRLF:
            line = line[:-1]

        lineno += 1
        encoded_line = ''
        # Last raw (unescaped) character seen on this line; None if the line
        # is empty.
        prev = None
        linelen = len(line)
        # Now we need to examine every character to see if it needs to be
        # quopri encoded.  BAW: again, string concatenation is inefficient.
        for j in range(linelen):
            c = line[j]
            prev = c
            if bqre.match(c):
                c = quote(c)
            elif j+1 == linelen:
                # Last character of the line and it needs no escaping.  A
                # trailing space or tab is held back here (it is emitted by
                # the end-of-line handling below); anything else is appended
                # directly, bypassing the length check.
                if c not in ' \t':
                    encoded_line += c
                prev = c
                continue
            # Check to see if the line has reached its maximum length; if so,
            # flush it with a soft line break before adding c.
            if len(encoded_line) + len(c) >= maxlinelen:
                encoded_body += encoded_line + '=' + eol
                encoded_line = ''
            encoded_line += c
        # Now at end of line..
        if prev and prev in ' \t':
            # Special case for whitespace at end of file: escape it so it
            # is not the last literal character of the output line.
            if lineno + 1 == len(lines):
                prev = quote(prev)
                if len(encoded_line) + len(prev) > maxlinelen:
                    encoded_body += encoded_line + '=' + eol + prev
                else:
                    encoded_body += encoded_line + prev
            # Just normal whitespace at end of line: keep it literal and
            # follow it with a soft line break.
            # NOTE(review): no length check is made on this branch.
            else:
                encoded_body += encoded_line + prev + '=' + eol
            encoded_line = ''
        # Now look at the line we just finished; if it had a line ending, we
        # need to add eol to the end of the encoded line.
        if lines[lineno].endswith(CRLF) or lines[lineno][-1] in CRLF:
            encoded_body += encoded_line + eol
        else:
            encoded_body += encoded_line
        encoded_line = ''
    return encoded_body
 240
 241
# For convenience and backwards compatibility with the standard base64
# module: body_encode and encodestring are alternate names for encode().
body_encode = encode
encodestring = encode
 245
 246
 247 \f
 248 # BAW: I'm not sure if the intent was for the signature of this function to be
 249 # the same as base64MIME.decode() or not...
 250 def decode(encoded, eol=NL):
 251     """Decode a quoted-printable string.
 252
 253     Lines are separated with eol, which defaults to \\n.
 254     """
 255     if not encoded:
 256         return encoded
 257     # BAW: see comment in encode() above.  Again, we're building up the
 258     # decoded string with string concatenation, which could be done much more
 259     # efficiently.
 260     decoded = ''
 261
 262     for line in encoded.splitlines():
 263         line = line.rstrip()
 264         if not line:
 265             decoded += eol
 266             continue
 267
 268         i = 0
 269         n = len(line)
 270         while i < n:
 271             c = line[i]
 272             if c <> '=':
 273                 decoded += c
 274                 i += 1
 275             # Otherwise, c == "=".  Are we at the end of the line?  If so, add
 276             # a soft line break.
 277             elif i+1 == n:
 278                 i += 1
 279                 continue
 280             # Decode if in form =AB
 281             elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
 282                 decoded += unquote(line[i:i+3])
 283                 i += 3
 284             # Otherwise, not in form =AB, pass literally
 285             else:
 286                 decoded += c
 287                 i += 1
 288
 289             if i == n:
 290                 decoded += eol
 291     # Special case if original string did not end with eol
 292     if not encoded.endswith(eol) and decoded.endswith(eol):
 293         decoded = decoded[:-1]
 294     return decoded
 295
 296
# For convenience and backwards compatibility with the standard base64
# module: body_decode and decodestring are alternate names for decode().
body_decode = decode
decodestring = decode
 300
 301
 302 \f
 303 def _unquote_match(match):
 304     """Turn a match in the form =AB to the ASCII character with value 0xab"""
 305     s = match.group(0)
 306     return unquote(s)
 307
 308
 309 # Header decoding is done a bit differently
 310 def header_decode(s):
 311     """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
 312
 313     This function does not parse a full MIME header value encoded with
 314     quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
 315     the high level email.Header class for that functionality.
 316     """
 317     s = s.replace('_', ' ')
 318     return re.sub(r'=\w{2}', _unquote_match, s)