Minor fix for currentframe (SF #1652788).
[python.git] / Doc / lib / emailcharsets.tex
blobe0be68ab7ee54b54ec78eaea515142776b0c65ce
1 \declaremodule{standard}{email.charset}
2 \modulesynopsis{Character Sets}
4 This module provides a class \class{Charset} for representing
5 character sets and character set conversions in email messages, as
6 well as a character set registry and several convenience methods for
7 manipulating this registry. Instances of \class{Charset} are used in
8 several other modules within the \module{email} package.
10 Import this class from the \module{email.charset} module.
12 \versionadded{2.2.2}
14 \begin{classdesc}{Charset}{\optional{input_charset}}
15 Map character sets to their email properties.
17 This class provides information about the requirements imposed on
18 email for a specific character set. It also provides convenience
19 routines for converting between character sets, given the availability
20 of the applicable codecs. Given a character set, it will do its best
21 to provide information on how to use that character set in an email
22 message in an RFC-compliant way.
24 Certain character sets must be encoded with quoted-printable or base64
25 when used in email headers or bodies. Certain character sets must be
26 converted outright, and are not allowed in email.
28 Optional \var{input_charset} is as described below; it is always
29 coerced to lower case. After being alias normalized it is also used
30 as a lookup into the registry of character sets to find out the header
31 encoding, body encoding, and output conversion codec to be used for
32 the character set. For example, if
33 \var{input_charset} is \code{iso-8859-1}, then headers and bodies will
34 be encoded using quoted-printable and no output conversion codec is
35 necessary. If \var{input_charset} is \code{euc-jp}, then headers will
36 be encoded with base64, bodies will not be encoded, but output text
37 will be converted from the \code{euc-jp} character set to the
38 \code{iso-2022-jp} character set.
39 \end{classdesc}
41 \class{Charset} instances have the following data attributes:
43 \begin{datadesc}{input_charset}
44 The initial character set specified. Common aliases are converted to
45 their \emph{official} email names (e.g.\ \code{latin_1} is converted to
46 \code{iso-8859-1}). Defaults to 7-bit \code{us-ascii}.
47 \end{datadesc}
49 \begin{datadesc}{header_encoding}
50 If the character set must be encoded before it can be used in an
51 email header, this attribute will be set to \code{Charset.QP} (for
52 quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
53 \code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
54 Otherwise, it will be \code{None}.
55 \end{datadesc}
57 \begin{datadesc}{body_encoding}
58 Same as \var{header_encoding}, but describes the encoding for the
59 mail message's body, which indeed may be different than the header
60 encoding. \code{Charset.SHORTEST} is not allowed for
61 \var{body_encoding}.
62 \end{datadesc}
64 \begin{datadesc}{output_charset}
65 Some character sets must be converted before they can be used in
66 email headers or bodies. If the \var{input_charset} is one of
67 them, this attribute will contain the name of the character set
68 output will be converted to. Otherwise, it will be \code{None}.
69 \end{datadesc}
71 \begin{datadesc}{input_codec}
72 The name of the Python codec used to convert the \var{input_charset} to
73 Unicode. If no conversion codec is necessary, this attribute will be
74 \code{None}.
75 \end{datadesc}
77 \begin{datadesc}{output_codec}
78 The name of the Python codec used to convert Unicode to the
79 \var{output_charset}. If no conversion codec is necessary, this
80 attribute will have the same value as the \var{input_codec}.
81 \end{datadesc}
83 \class{Charset} instances also have the following methods:
85 \begin{methoddesc}[Charset]{get_body_encoding}{}
86 Return the content transfer encoding used for body encoding.
88 This is either the string \samp{quoted-printable} or \samp{base64}
89 depending on the encoding used, or it is a function, in which case you
90 should call the function with a single argument, the Message object
91 being encoded. The function should then set the
92 \mailheader{Content-Transfer-Encoding} header itself to whatever is
93 appropriate.
95 Returns the string \samp{quoted-printable} if
96 \var{body_encoding} is \code{QP}, returns the string
97 \samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
98 string \samp{7bit} otherwise.
99 \end{methoddesc}
101 \begin{methoddesc}{convert}{s}
102 Convert the string \var{s} from the \var{input_codec} to the
103 \var{output_codec}.
104 \end{methoddesc}
106 \begin{methoddesc}{to_splittable}{s}
107 Convert a possibly multibyte string to a safely splittable format.
108 \var{s} is the string to split.
110 Uses the \var{input_codec} to try and convert the string to Unicode,
111 so it can be safely split on character boundaries (even for multibyte
112 characters).
114 Returns the string as-is if it isn't known how to convert \var{s} to
115 Unicode with the \var{input_charset}.
117 Characters that could not be converted to Unicode will be replaced
118 with the Unicode replacement character \character{U+FFFD}.
119 \end{methoddesc}
121 \begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
122 Convert a splittable string back into an encoded string. \var{ustr}
123 is a Unicode string to ``unsplit''.
125 This method uses the proper codec to try and convert the string from
126 Unicode back into an encoded format. Return the string as-is if it is
127 not Unicode, or if it could not be converted from Unicode.
129 Characters that could not be converted from Unicode will be replaced
130 with an appropriate character (usually \character{?}).
132 If \var{to_output} is \code{True} (the default), uses
133 \var{output_codec} to convert to an
134 encoded format. If \var{to_output} is \code{False}, it uses
135 \var{input_codec}.
136 \end{methoddesc}
138 \begin{methoddesc}{get_output_charset}{}
139 Return the output character set.
141 This is the \var{output_charset} attribute if that is not \code{None},
142 otherwise it is \var{input_charset}.
143 \end{methoddesc}
145 \begin{methoddesc}{encoded_header_len}{}
146 Return the length of the encoded header string, properly calculating
147 for quoted-printable or base64 encoding.
148 \end{methoddesc}
150 \begin{methoddesc}{header_encode}{s\optional{, convert}}
151 Header-encode the string \var{s}.
153 If \var{convert} is \code{True}, the string will be converted from the
154 input charset to the output charset automatically. This is not useful
155 for multibyte character sets, which have line length issues (multibyte
156 characters must be split on a character, not a byte boundary); use the
157 higher-level \class{Header} class to deal with these issues (see
158 \refmodule{email.header}). \var{convert} defaults to \code{False}.
160 The type of encoding (base64 or quoted-printable) will be based on
161 the \var{header_encoding} attribute.
162 \end{methoddesc}
164 \begin{methoddesc}{body_encode}{s\optional{, convert}}
165 Body-encode the string \var{s}.
167 If \var{convert} is \code{True} (the default), the string will be
168 converted from the input charset to output charset automatically.
169 Unlike \method{header_encode()}, there are no issues with byte
170 boundaries and multibyte charsets in email bodies, so this is usually
171 pretty safe.
173 The type of encoding (base64 or quoted-printable) will be based on
174 the \var{body_encoding} attribute.
175 \end{methoddesc}
177 The \class{Charset} class also provides a number of methods to support
178 standard operations and built-in functions.
180 \begin{methoddesc}[Charset]{__str__}{}
181 Returns \var{input_charset} as a string coerced to lower case.
182 \method{__repr__()} is an alias for \method{__str__()}.
183 \end{methoddesc}
185 \begin{methoddesc}[Charset]{__eq__}{other}
186 This method allows you to compare two \class{Charset} instances for equality.
187 \end{methoddesc}
189 \begin{methoddesc}[Charset]{__ne__}{other}
190 This method allows you to compare two \class{Charset} instances for inequality.
191 \end{methoddesc}
193 The \module{email.charset} module also provides the following
194 functions for adding new entries to the global character set, alias,
195 and codec registries:
197 \begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
198 body_enc\optional{, output_charset}}}}
199 Add character properties to the global registry.
201 \var{charset} is the input character set, and must be the canonical
202 name of a character set.
204 Optional \var{header_enc} and \var{body_enc} are each one of
205 \code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
206 base64 encoding, \code{Charset.SHORTEST} for the shortest of
207 quoted-printable or base64 encoding, or \code{None} for no encoding.
208 \code{SHORTEST} is only valid for \var{header_enc}. The default is
209 \code{None} for no encoding.
211 Optional \var{output_charset} is the character set that the output
212 should be in. Conversions will proceed from input charset, to
213 Unicode, to the output charset when the method
214 \method{Charset.convert()} is called. The default is to output in the
215 same character set as the input.
217 Both \var{charset} and \var{output_charset} must have Unicode
218 codec entries in the module's character set-to-codec mapping; use
219 \function{add_codec()} to add codecs the module does
220 not know about. See the \refmodule{codecs} module's documentation for
221 more information.
223 The global character set registry is kept in the module global
224 dictionary \code{CHARSETS}.
225 \end{funcdesc}
227 \begin{funcdesc}{add_alias}{alias, canonical}
228 Add a character set alias. \var{alias} is the alias name,
229 e.g.\ \code{latin-1}. \var{canonical} is the character set's canonical
230 name, e.g.\ \code{iso-8859-1}.
232 The global charset alias registry is kept in the module global
233 dictionary \code{ALIASES}.
234 \end{funcdesc}
236 \begin{funcdesc}{add_codec}{charset, codecname}
237 Add a codec that maps characters in the given character set to and from
238 Unicode.
240 \var{charset} is the canonical name of a character set.
241 \var{codecname} is the name of a Python codec, as appropriate for the
242 second argument to the \function{unicode()} built-in, or to the
243 \method{encode()} method of a Unicode string.
244 \end{funcdesc}