Doc/lib/emailcharsets.tex

   1 \declaremodule{standard}{email.charset}
   2 \modulesynopsis{Character Sets}
   3
   4 This module provides a class \class{Charset} for representing
   5 character sets and character set conversions in email messages, as
   6 well as a character set registry and several convenience methods for
   7 manipulating this registry.  Instances of \class{Charset} are used in
   8 several other modules within the \module{email} package.
   9
  10 Import this class from the \module{email.charset} module.
  11
  12 \versionadded{2.2.2}
  13
  14 \begin{classdesc}{Charset}{\optional{input_charset}}
  15 Map character sets to their email properties.
  16
  17 This class provides information about the requirements imposed on
  18 email for a specific character set.  It also provides convenience
  19 routines for converting between character sets, given the availability
  20 of the applicable codecs.  Given a character set, it will do its best
  21 to provide information on how to use that character set in an email
  22 message in an RFC-compliant way.
  23
  24 Certain character sets must be encoded with quoted-printable or base64
  25 when used in email headers or bodies.  Certain character sets must be
  26 converted outright, and are not allowed in email.
  27
  28 Optional \var{input_charset} is as described below; it is always
  29 coerced to lower case.  After being alias normalized it is also used
  30 as a lookup into the registry of character sets to find out the header
  31 encoding, body encoding, and output conversion codec to be used for
  32 the character set.  For example, if
  33 \var{input_charset} is \code{iso-8859-1}, then headers and bodies will
  34 be encoded using quoted-printable and no output conversion codec is
  35 necessary.  If \var{input_charset} is \code{euc-jp}, then headers will
  36 be encoded with base64, bodies will not be encoded, but output text
  37 will be converted from the \code{euc-jp} character set to the
  38 \code{iso-2022-jp} character set.
  39 \end{classdesc}
  40
  41 \class{Charset} instances have the following data attributes:
  42
  43 \begin{datadesc}{input_charset}
  44 The initial character set specified.  Common aliases are converted to
  45 their \emph{official} email names (e.g.\ \code{latin_1} is converted to
  46 \code{iso-8859-1}).  Defaults to 7-bit \code{us-ascii}.
  47 \end{datadesc}
  48
  49 \begin{datadesc}{header_encoding}
  50 If the character set must be encoded before it can be used in an
  51 email header, this attribute will be set to \code{Charset.QP} (for
  52 quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
  53 \code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
  54 Otherwise, it will be \code{None}.
  55 \end{datadesc}
  56
  57 \begin{datadesc}{body_encoding}
  58 Same as \var{header_encoding}, but describes the encoding for the
  59 mail message's body, which indeed may be different than the header
  60 encoding.  \code{Charset.SHORTEST} is not allowed for
  61 \var{body_encoding}.
  62 \end{datadesc}
  63
  64 \begin{datadesc}{output_charset}
  65 Some character sets must be converted before they can be used in
  66 email headers or bodies.  If the \var{input_charset} is one of
  67 them, this attribute will contain the name of the character set
  68 output will be converted to.  Otherwise, it will be \code{None}.
  69 \end{datadesc}
  70
  71 \begin{datadesc}{input_codec}
  72 The name of the Python codec used to convert the \var{input_charset} to
  73 Unicode.  If no conversion codec is necessary, this attribute will be
  74 \code{None}.
  75 \end{datadesc}
  76
  77 \begin{datadesc}{output_codec}
  78 The name of the Python codec used to convert Unicode to the
  79 \var{output_charset}.  If no conversion codec is necessary, this
  80 attribute will have the same value as the \var{input_codec}.
  81 \end{datadesc}
  82
  83 \class{Charset} instances also have the following methods:
  84
  85 \begin{methoddesc}[Charset]{get_body_encoding}{}
  86 Return the content transfer encoding used for body encoding.
  87
  88 This is either the string \samp{quoted-printable} or \samp{base64}
  89 depending on the encoding used, or it is a function, in which case you
  90 should call the function with a single argument, the Message object
  91 being encoded.  The function should then set the
  92 \mailheader{Content-Transfer-Encoding} header itself to whatever is
  93 appropriate.
  94
  95 Returns the string \samp{quoted-printable} if
  96 \var{body_encoding} is \code{QP}, returns the string
  97 \samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
  98 string \samp{7bit} otherwise.
  99 \end{methoddesc}
 100
 101 \begin{methoddesc}{convert}{s}
 102 Convert the string \var{s} from the \var{input_codec} to the
 103 \var{output_codec}.
 104 \end{methoddesc}
 105
 106 \begin{methoddesc}{to_splittable}{s}
 107 Convert a possibly multibyte string to a safely splittable format.
 108 \var{s} is the string to split.
 109
 110 Uses the \var{input_codec} to try to convert the string to Unicode,
 111 so it can be safely split on character boundaries (even for multibyte
 112 characters).
 113
 114 Returns the string as-is if it isn't known how to convert \var{s} to
 115 Unicode with the \var{input_charset}.
 116
 117 Characters that could not be converted to Unicode will be replaced
 118 with the Unicode replacement character \character{U+FFFD}.
 119 \end{methoddesc}
 120
 121 \begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
 122 Convert a splittable string back into an encoded string.  \var{ustr}
 123 is a Unicode string to ``unsplit''.
 124
 125 This method uses the proper codec to try to convert the string from
 126 Unicode back into an encoded format.  Return the string as-is if it is
 127 not Unicode, or if it could not be converted from Unicode.
 128
 129 Characters that could not be converted from Unicode will be replaced
 130 with an appropriate character (usually \character{?}).
 131
 132 If \var{to_output} is \code{True} (the default), uses
 133 \var{output_codec} to convert to an
 134 encoded format.  If \var{to_output} is \code{False}, it uses
 135 \var{input_codec}.
 136 \end{methoddesc}
 137
 138 \begin{methoddesc}{get_output_charset}{}
 139 Return the output character set.
 140
 141 This is the \var{output_charset} attribute if that is not \code{None},
 142 otherwise it is \var{input_charset}.
 143 \end{methoddesc}
 144
 145 \begin{methoddesc}{encoded_header_len}{s}
 146 Return the length of the encoded header string for \var{s}, properly
 147 calculating for quoted-printable or base64 encoding.
 148 \end{methoddesc}
 149
 150 \begin{methoddesc}{header_encode}{s\optional{, convert}}
 151 Header-encode the string \var{s}.
 152
 153 If \var{convert} is \code{True}, the string will be converted from the
 154 input charset to the output charset automatically.  This is not useful
 155 for multibyte character sets, which have line length issues (multibyte
 156 characters must be split on a character, not a byte boundary); use the
 157 higher-level \class{Header} class to deal with these issues (see
 158 \refmodule{email.header}).  \var{convert} defaults to \code{False}.
 159
 160 The type of encoding (base64 or quoted-printable) will be based on
 161 the \var{header_encoding} attribute.
 162 \end{methoddesc}
 163
 164 \begin{methoddesc}{body_encode}{s\optional{, convert}}
 165 Body-encode the string \var{s}.
 166
 167 If \var{convert} is \code{True} (the default), the string will be
 168 converted from the input charset to output charset automatically.
 169 Unlike \method{header_encode()}, there are no issues with byte
 170 boundaries and multibyte charsets in email bodies, so this is usually
 171 pretty safe.
 172
 173 The type of encoding (base64 or quoted-printable) will be based on
 174 the \var{body_encoding} attribute.
 175 \end{methoddesc}
 176
 177 The \class{Charset} class also provides a number of methods to support
 178 standard operations and built-in functions.
 179
 180 \begin{methoddesc}[Charset]{__str__}{}
 181 Returns \var{input_charset} as a string coerced to lower case.
 182 \method{__repr__()} is an alias for \method{__str__()}.
 183 \end{methoddesc}
 184
 185 \begin{methoddesc}[Charset]{__eq__}{other}
 186 This method allows you to compare two \class{Charset} instances for equality.
 187 \end{methoddesc}
 188
 189 \begin{methoddesc}[Charset]{__ne__}{other}
 190 This method allows you to compare two \class{Charset} instances for inequality.
 191 \end{methoddesc}
 192
 193 The \module{email.charset} module also provides the following
 194 functions for adding new entries to the global character set, alias,
 195 and codec registries:
 196
 197 \begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
 198     body_enc\optional{, output_charset}}}}
 199 Add character set properties to the global registry.
 200
 201 \var{charset} is the input character set, and must be the canonical
 202 name of a character set.
 203
 204 Optional \var{header_enc} and \var{body_enc} are each either
 205 \code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
 206 base64 encoding, \code{Charset.SHORTEST} for the shortest of
 207 quoted-printable or base64 encoding, or \code{None} for no encoding.
 208 \code{SHORTEST} is only valid for \var{header_enc}. The default is
 209 \code{None} for no encoding.
 210
 211 Optional \var{output_charset} is the character set that the output
 212 should be in.  Conversions will proceed from input charset, to
 213 Unicode, to the output charset when the method
 214 \method{Charset.convert()} is called.  The default is to output in the
 215 same character set as the input.
 216
 217 Both \var{charset} and \var{output_charset} must have Unicode
 218 codec entries in the module's character set-to-codec mapping; use
 219 \function{add_codec()} to add codecs the module does
 220 not know about.  See the \refmodule{codecs} module's documentation for
 221 more information.
 222
 223 The global character set registry is kept in the module global
 224 dictionary \code{CHARSETS}.
 225 \end{funcdesc}
 226
 227 \begin{funcdesc}{add_alias}{alias, canonical}
 228 Add a character set alias.  \var{alias} is the alias name,
 229 e.g.\ \code{latin-1}.  \var{canonical} is the character set's canonical
 230 name, e.g.\ \code{iso-8859-1}.
 231
 232 The global charset alias registry is kept in the module global
 233 dictionary \code{ALIASES}.
 234 \end{funcdesc}
 235
 236 \begin{funcdesc}{add_codec}{charset, codecname}
 237 Add a codec that maps characters in the given character set to and from
 238 Unicode.
 239
 240 \var{charset} is the canonical name of a character set.
 241 \var{codecname} is the name of a Python codec, as appropriate for the
 242 second argument to the \function{unicode()} built-in, or to the
 243 \method{encode()} method of a Unicode string.
 244 \end{funcdesc}