Added a test for the ability to specify a class attribute in Formatter configuration...
[python.git] / Doc / lib / libstring.tex
blob1828b2e1cd779da6fd99563b3b2eae20c0e32e0e
1 \section{\module{string} ---
2 Common string operations}
4 \declaremodule{standard}{string}
5 \modulesynopsis{Common string operations.}
7 The \module{string} module contains a number of useful constants and classes,
8 as well as some deprecated legacy functions that are also available as methods
9 on strings. See the module \refmodule{re}\refstmodindex{re} for string
10 functions based on regular expressions.
12 \subsection{String constants}
14 The constants defined in this module are:
16 \begin{datadesc}{ascii_letters}
17 The concatenation of the \constant{ascii_lowercase} and
18 \constant{ascii_uppercase} constants described below. This value is
19 not locale-dependent.
20 \end{datadesc}
22 \begin{datadesc}{ascii_lowercase}
23 The lowercase letters \code{'abcdefghijklmnopqrstuvwxyz'}. This
24 value is not locale-dependent and will not change.
25 \end{datadesc}
27 \begin{datadesc}{ascii_uppercase}
28 The uppercase letters \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. This
29 value is not locale-dependent and will not change.
30 \end{datadesc}
32 \begin{datadesc}{digits}
33 The string \code{'0123456789'}.
34 \end{datadesc}
36 \begin{datadesc}{hexdigits}
37 The string \code{'0123456789abcdefABCDEF'}.
38 \end{datadesc}
40 \begin{datadesc}{letters}
41 The concatenation of the strings \constant{lowercase} and
42 \constant{uppercase} described below. The specific value is
43 locale-dependent, and will be updated when
44 \function{locale.setlocale()} is called.
45 \end{datadesc}
47 \begin{datadesc}{lowercase}
48 A string containing all the characters that are considered lowercase
49 letters. On most systems this is the string
50 \code{'abcdefghijklmnopqrstuvwxyz'}. Do not change its definition ---
51 the effect on the routines \function{upper()} and
52 \function{swapcase()} is undefined. The specific value is
53 locale-dependent, and will be updated when
54 \function{locale.setlocale()} is called.
55 \end{datadesc}
57 \begin{datadesc}{octdigits}
58 The string \code{'01234567'}.
59 \end{datadesc}
61 \begin{datadesc}{punctuation}
62 String of \ASCII{} characters which are considered punctuation
63 characters in the \samp{C} locale.
64 \end{datadesc}
66 \begin{datadesc}{printable}
67 String of characters which are considered printable. This is a
68 combination of \constant{digits}, \constant{letters},
69 \constant{punctuation}, and \constant{whitespace}.
70 \end{datadesc}
72 \begin{datadesc}{uppercase}
73 A string containing all the characters that are considered uppercase
74 letters. On most systems this is the string
75 \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}. Do not change its definition ---
76 the effect on the routines \function{lower()} and
77 \function{swapcase()} is undefined. The specific value is
78 locale-dependent, and will be updated when
79 \function{locale.setlocale()} is called.
80 \end{datadesc}
82 \begin{datadesc}{whitespace}
83 A string containing all characters that are considered whitespace.
84 On most systems this includes the characters space, tab, linefeed,
85 return, formfeed, and vertical tab. Do not change its definition ---
86 the effect on the routines \function{strip()} and \function{split()}
87 is undefined.
88 \end{datadesc}
90 \subsection{Template strings}
92 Templates provide simpler string substitutions as described in \pep{292}.
93 Instead of the normal \samp{\%}-based substitutions, Templates support
94 \samp{\$}-based substitutions, using the following rules:
96 \begin{itemize}
97 \item \samp{\$\$} is an escape; it is replaced with a single \samp{\$}.
99 \item \samp{\$identifier} names a substitution placeholder matching a mapping
100 key of ``identifier''. By default, ``identifier'' must spell a Python
101 identifier. The first non-identifier character after the \samp{\$}
102 character terminates this placeholder specification.
104 \item \samp{\$\{identifier\}} is equivalent to \samp{\$identifier}. It is
105 required when valid identifier characters follow the placeholder but are
106 not part of the placeholder, such as \samp{\$\{noun\}ification}.
107 \end{itemize}
109 Any other appearance of \samp{\$} in the string will result in a
110 \exception{ValueError} being raised.
112 \versionadded{2.4}
114 The \module{string} module provides a \class{Template} class that implements
115 these rules. The methods of \class{Template} are:
117 \begin{classdesc}{Template}{template}
118 The constructor takes a single argument which is the template string.
119 \end{classdesc}
121 \begin{methoddesc}[Template]{substitute}{mapping\optional{, **kws}}
122 Performs the template substitution, returning a new string. \var{mapping} is
123 any dictionary-like object with keys that match the placeholders in the
124 template. Alternatively, you can provide keyword arguments, where the
125 keywords are the placeholders. When both \var{mapping} and \var{kws} are
126 given and there are duplicates, the placeholders from \var{kws} take
127 precedence.
128 \end{methoddesc}
130 \begin{methoddesc}[Template]{safe_substitute}{mapping\optional{, **kws}}
131 Like \method{substitute()}, except that if placeholders are missing from
132 \var{mapping} and \var{kws}, instead of raising a \exception{KeyError}
133 exception, the original placeholder will appear in the resulting string
134 intact. Also, unlike with \method{substitute()}, any other appearance of
135 \samp{\$} will simply be left as \samp{\$} in the result instead of raising
136 \exception{ValueError}.
138 While other exceptions may still occur, this method is called ``safe'' because
139 substitution always tries to return a usable string instead of raising an
140 exception. In another sense, \method{safe_substitute()} may be anything other
141 than safe, since it will silently ignore malformed templates containing
142 dangling delimiters, unmatched braces, or placeholders that are not valid
143 Python identifiers.
144 \end{methoddesc}
146 \class{Template} instances also provide one public data attribute:
148 \begin{memberdesc}[string]{template}
149 This is the object passed to the constructor's \var{template} argument. In
150 general, you shouldn't change it, but read-only access is not enforced.
151 \end{memberdesc}
153 Here is an example of how to use a Template:
155 \begin{verbatim}
156 >>> from string import Template
157 >>> s = Template('$who likes $what')
158 >>> s.substitute(who='tim', what='kung pao')
159 'tim likes kung pao'
160 >>> d = dict(who='tim')
161 >>> Template('Give $who $100').substitute(d)
162 Traceback (most recent call last):
163 [...]
164 ValueError: Invalid placeholder in string: line 1, col 10
165 >>> Template('$who likes $what').substitute(d)
166 Traceback (most recent call last):
167 [...]
168 KeyError: 'what'
169 >>> Template('$who likes $what').safe_substitute(d)
170 'tim likes $what'
171 \end{verbatim}
173 Advanced usage: you can derive subclasses of \class{Template} to customize the
174 placeholder syntax, delimiter character, or the entire regular expression used
175 to parse template strings. To do this, you can override these class
176 attributes:
178 \begin{itemize}
179 \item \var{delimiter} -- This is the literal string describing the delimiter
180 that introduces a placeholder. The default value is \samp{\$}. Note that this
181 should \emph{not} be a regular expression, as the implementation will
182 call \method{re.escape()} on this string as needed.
183 \item \var{idpattern} -- This is the regular expression describing the pattern
184 for non-braced placeholders (the braces will be added automatically as
185 appropriate). The default value is the regular expression
186 \samp{[_a-z][_a-z0-9]*}.
187 \end{itemize}
189 Alternatively, you can provide the entire regular expression pattern by
190 overriding the class attribute \var{pattern}. If you do this, the value must
191 be a regular expression object with four named capturing groups. The
192 capturing groups correspond to the rules given above, along with the invalid
193 placeholder rule:
195 \begin{itemize}
196 \item \var{escaped} -- This group matches the escape sequence,
197 e.g.\ \samp{\$\$}, in the default pattern.
198 \item \var{named} -- This group matches the unbraced placeholder name; it
199 should not include the delimiter in the capturing group.
200 \item \var{braced} -- This group matches the brace enclosed placeholder name;
201 it should not include either the delimiter or braces in the capturing
202 group.
203 \item \var{invalid} -- This group matches any other delimiter pattern (usually
204 a single delimiter), and it should appear last in the regular
205 expression.
206 \end{itemize}
208 \subsection{String functions}
210 The following functions are available to operate on string and Unicode
211 objects. They are not available as string methods.
213 \begin{funcdesc}{capwords}{s}
214 Split the argument into words using \function{split()}, capitalize
215 each word using \function{capitalize()}, and join the capitalized
216 words using \function{join()}. Note that this replaces runs of
217 whitespace characters by a single space, and removes leading and
218 trailing whitespace.
219 \end{funcdesc}
221 \begin{funcdesc}{maketrans}{from, to}
222 Return a translation table suitable for passing to
223 \function{translate()} or \function{regex.compile()}, that will map
224 each character in \var{from} into the character at the same position
225 in \var{to}; \var{from} and \var{to} must have the same length.
227 \warning{Don't use strings derived from \constant{lowercase}
228 and \constant{uppercase} as arguments; in some locales, these don't have
229 the same length. For case conversions, always use
230 \function{lower()} and \function{upper()}.}
231 \end{funcdesc}
233 \subsection{Deprecated string functions}
235 The following functions are also defined as methods of string and
236 Unicode objects; see ``String Methods''
237 (section~\ref{string-methods}) for more information on those. You should consider
238 these functions as deprecated, although they will not be removed until Python
239 3.0. The functions defined in this module are:
241 \begin{funcdesc}{atof}{s}
242 \deprecated{2.0}{Use the \function{float()} built-in function.}
243 Convert a string to a floating point number. The string must have
244 the standard syntax for a floating point literal in Python,
245 optionally preceded by a sign (\samp{+} or \samp{-}). Note that
246 this behaves identically to the built-in function
247 \function{float()}\bifuncindex{float} when passed a string.
249 \note{When passing in a string, values for NaN\index{NaN}
250 and Infinity\index{Infinity} may be returned, depending on the
251 underlying C library. The specific set of strings accepted which
252 cause these values to be returned depends entirely on the C library
253 and is known to vary.}
254 \end{funcdesc}
256 \begin{funcdesc}{atoi}{s\optional{, base}}
257 \deprecated{2.0}{Use the \function{int()} built-in function.}
258 Convert string \var{s} to an integer in the given \var{base}. The
259 string must consist of one or more digits, optionally preceded by a
260 sign (\samp{+} or \samp{-}). The \var{base} defaults to 10. If it
261 is 0, a default base is chosen depending on the leading characters
262 of the string (after stripping the sign): \samp{0x} or \samp{0X}
263 means 16, \samp{0} means 8, anything else means 10. If \var{base}
264 is 16, a leading \samp{0x} or \samp{0X} is always accepted, though
265 not required. This behaves identically to the built-in function
266 \function{int()} when passed a string. (Also note: for a more
267 flexible interpretation of numeric literals, use the built-in
268 function \function{eval()}\bifuncindex{eval}.)
269 \end{funcdesc}
271 \begin{funcdesc}{atol}{s\optional{, base}}
272 \deprecated{2.0}{Use the \function{long()} built-in function.}
273 Convert string \var{s} to a long integer in the given \var{base}.
274 The string must consist of one or more digits, optionally preceded
275 by a sign (\samp{+} or \samp{-}). The \var{base} argument has the
276 same meaning as for \function{atoi()}. A trailing \samp{l} or
277 \samp{L} is not allowed, except if the base is 0. Note that when
278 invoked without \var{base} or with \var{base} set to 10, this
279 behaves identically to the built-in function
280 \function{long()}\bifuncindex{long} when passed a string.
281 \end{funcdesc}
283 \begin{funcdesc}{capitalize}{word}
284 Return a copy of \var{word} with only its first character capitalized.
285 \end{funcdesc}
287 \begin{funcdesc}{expandtabs}{s\optional{, tabsize}}
288 Expand tabs in a string replacing them by one or more spaces,
289 depending on the current column and the given tab size. The column
290 number is reset to zero after each newline occurring in the string.
291 This doesn't understand other non-printing characters or escape
292 sequences. The tab size defaults to 8.
293 \end{funcdesc}
295 \begin{funcdesc}{find}{s, sub\optional{, start\optional{, end}}}
296 Return the lowest index in \var{s} where the substring \var{sub} is
297 found such that \var{sub} is wholly contained in
298 \code{\var{s}[\var{start}:\var{end}]}. Return \code{-1} on failure.
299 Defaults for \var{start} and \var{end} and interpretation of
300 negative values are the same as for slices.
301 \end{funcdesc}
303 \begin{funcdesc}{rfind}{s, sub\optional{, start\optional{, end}}}
304 Like \function{find()} but find the highest index.
305 \end{funcdesc}
307 \begin{funcdesc}{index}{s, sub\optional{, start\optional{, end}}}
308 Like \function{find()} but raise \exception{ValueError} when the
309 substring is not found.
310 \end{funcdesc}
312 \begin{funcdesc}{rindex}{s, sub\optional{, start\optional{, end}}}
313 Like \function{rfind()} but raise \exception{ValueError} when the
314 substring is not found.
315 \end{funcdesc}
317 \begin{funcdesc}{count}{s, sub\optional{, start\optional{, end}}}
318 Return the number of (non-overlapping) occurrences of substring
319 \var{sub} in string \code{\var{s}[\var{start}:\var{end}]}.
320 Defaults for \var{start} and \var{end} and interpretation of
321 negative values are the same as for slices.
322 \end{funcdesc}
324 \begin{funcdesc}{lower}{s}
325 Return a copy of \var{s}, but with upper case letters converted to
326 lower case.
327 \end{funcdesc}
329 \begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}}
330 Return a list of the words of the string \var{s}. If the optional
331 second argument \var{sep} is absent or \code{None}, the words are
332 separated by arbitrary strings of whitespace characters (space, tab,
333 newline, return, formfeed). If the second argument \var{sep} is
334 present and not \code{None}, it specifies a string to be used as the
335 word separator. The returned list will then have one more item
336 than the number of non-overlapping occurrences of the separator in
337 the string. The optional third argument \var{maxsplit} defaults to
338 0. If it is nonzero, at most \var{maxsplit} number of splits occur,
339 and the remainder of the string is returned as the final element of
340 the list (thus, the list will have at most \code{\var{maxsplit}+1}
341 elements).
343 The behavior of split on an empty string depends on the value of \var{sep}.
344 If \var{sep} is not specified, or specified as \code{None}, the result will
345 be an empty list. If \var{sep} is specified as any string, the result will
346 be a list containing one element which is an empty string.
347 \end{funcdesc}
349 \begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
350 Return a list of the words of the string \var{s}, scanning \var{s}
351 from the end. To all intents and purposes, the resulting list of
352 words is the same as returned by \function{split()}, except when the
353 optional third argument \var{maxsplit} is explicitly specified and
354 nonzero. When \var{maxsplit} is nonzero, at most \var{maxsplit}
355 number of splits -- the \emph{rightmost} ones -- occur, and the remainder
356 of the string is returned as the first element of the list (thus, the
357 list will have at most \code{\var{maxsplit}+1} elements).
358 \versionadded{2.4}
359 \end{funcdesc}
361 \begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
362 This function behaves identically to \function{split()}. (In the
363 past, \function{split()} was only used with one argument, while
364 \function{splitfields()} was only used with two arguments.)
365 \end{funcdesc}
367 \begin{funcdesc}{join}{words\optional{, sep}}
368 Concatenate a list or tuple of words with intervening occurrences of
369 \var{sep}. The default value for \var{sep} is a single space
370 character. It is always true that
371 \samp{string.join(string.split(\var{s}, \var{sep}), \var{sep})}
372 equals \var{s}.
373 \end{funcdesc}
375 \begin{funcdesc}{joinfields}{words\optional{, sep}}
376 This function behaves identically to \function{join()}. (In the past,
377 \function{join()} was only used with one argument, while
378 \function{joinfields()} was only used with two arguments.)
379 Note that there is no \method{joinfields()} method on string
380 objects; use the \method{join()} method instead.
381 \end{funcdesc}
383 \begin{funcdesc}{lstrip}{s\optional{, chars}}
384 Return a copy of the string with leading characters removed. If
385 \var{chars} is omitted or \code{None}, whitespace characters are
386 removed. If given and not \code{None}, \var{chars} must be a string;
387 the characters in the string will be stripped from the beginning of
388 the string \var{s}.
389 \versionchanged[The \var{chars} parameter was added. The \var{chars}
390 parameter cannot be passed in earlier 2.2 versions]{2.2.3}
391 \end{funcdesc}
393 \begin{funcdesc}{rstrip}{s\optional{, chars}}
394 Return a copy of the string with trailing characters removed. If
395 \var{chars} is omitted or \code{None}, whitespace characters are
396 removed. If given and not \code{None}, \var{chars} must be a string;
397 the characters in the string will be stripped from the end of the
398 string \var{s}.
399 \versionchanged[The \var{chars} parameter was added. The \var{chars}
400 parameter cannot be passed in earlier 2.2 versions]{2.2.3}
401 \end{funcdesc}
403 \begin{funcdesc}{strip}{s\optional{, chars}}
404 Return a copy of the string with leading and trailing characters
405 removed. If \var{chars} is omitted or \code{None}, whitespace
406 characters are removed. If given and not \code{None}, \var{chars}
407 must be a string; the characters in the string will be stripped from
408 both ends of the string \var{s}.
409 \versionchanged[The \var{chars} parameter was added. The \var{chars}
410 parameter cannot be passed in earlier 2.2 versions]{2.2.3}
411 \end{funcdesc}
413 \begin{funcdesc}{swapcase}{s}
414 Return a copy of \var{s}, but with lower case letters
415 converted to upper case and vice versa.
416 \end{funcdesc}
418 \begin{funcdesc}{translate}{s, table\optional{, deletechars}}
419 Delete all characters from \var{s} that are in \var{deletechars} (if
420 present), and then translate the characters using \var{table}, which
421 must be a 256-character string giving the translation for each
422 character value, indexed by its ordinal.
423 \end{funcdesc}
425 \begin{funcdesc}{upper}{s}
426 Return a copy of \var{s}, but with lower case letters converted to
427 upper case.
428 \end{funcdesc}
430 \begin{funcdesc}{ljust}{s, width}
431 \funcline{rjust}{s, width}
432 \funcline{center}{s, width}
433 These functions respectively left-justify, right-justify and center
434 a string in a field of given width. They return a string that is at
435 least \var{width} characters wide, created by padding the string
436 \var{s} with spaces until the given width on the right, left or both
437 sides. The string is never truncated.
438 \end{funcdesc}
440 \begin{funcdesc}{zfill}{s, width}
441 Pad a numeric string on the left with zero digits until the given
442 width is reached. Strings starting with a sign are handled
443 correctly.
444 \end{funcdesc}
446 \begin{funcdesc}{replace}{str, old, new\optional{, maxreplace}}
447 Return a copy of string \var{str} with all occurrences of substring
448 \var{old} replaced by \var{new}. If the optional argument
449 \var{maxreplace} is given, the first \var{maxreplace} occurrences are
450 replaced.
451 \end{funcdesc}