Doc/lib/libstring.tex

   1 \section{\module{string} ---
   2          Common string operations}
   3
   4 \declaremodule{standard}{string}
   5 \modulesynopsis{Common string operations.}
   6
   7 The \module{string} module contains a number of useful constants and classes,
   8 as well as some deprecated legacy functions that are also available as methods
   9 on strings.  See the module \refmodule{re}\refstmodindex{re} for string
  10 functions based on regular expressions.
  11
  12 \subsection{String constants}
  13
  14 The constants defined in this module are:
  15
  16 \begin{datadesc}{ascii_letters}
  17   The concatenation of the \constant{ascii_lowercase} and
  18   \constant{ascii_uppercase} constants described below.  This value is
  19   not locale-dependent.
  20 \end{datadesc}
  21
  22 \begin{datadesc}{ascii_lowercase}
  23   The lowercase letters \code{'abcdefghijklmnopqrstuvwxyz'}.  This
  24   value is not locale-dependent and will not change.
  25 \end{datadesc}
  26
  27 \begin{datadesc}{ascii_uppercase}
  28   The uppercase letters \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}.  This
  29   value is not locale-dependent and will not change.
  30 \end{datadesc}
  31
  32 \begin{datadesc}{digits}
  33   The string \code{'0123456789'}.
  34 \end{datadesc}
  35
  36 \begin{datadesc}{hexdigits}
  37   The string \code{'0123456789abcdefABCDEF'}.
  38 \end{datadesc}
  39
  40 \begin{datadesc}{letters}
  41   The concatenation of the strings \constant{lowercase} and
  42   \constant{uppercase} described below.  The specific value is
  43   locale-dependent, and will be updated when
  44   \function{locale.setlocale()} is called.
  45 \end{datadesc}
  46
  47 \begin{datadesc}{lowercase}
  48   A string containing all the characters that are considered lowercase
  49   letters.  On most systems this is the string
  50   \code{'abcdefghijklmnopqrstuvwxyz'}.  Do not change its definition ---
  51   the effect on the routines \function{upper()} and
  52   \function{swapcase()} is undefined.  The specific value is
  53   locale-dependent, and will be updated when
  54   \function{locale.setlocale()} is called.
  55 \end{datadesc}
  56
  57 \begin{datadesc}{octdigits}
  58   The string \code{'01234567'}.
  59 \end{datadesc}
  60
  61 \begin{datadesc}{punctuation}
  62   String of \ASCII{} characters which are considered punctuation
  63   characters in the \samp{C} locale.
  64 \end{datadesc}
  65
  66 \begin{datadesc}{printable}
  67   String of characters which are considered printable.  This is a
  68   combination of \constant{digits}, \constant{letters},
  69   \constant{punctuation}, and \constant{whitespace}.
  70 \end{datadesc}
  71
  72 \begin{datadesc}{uppercase}
  73   A string containing all the characters that are considered uppercase
  74   letters.  On most systems this is the string
  75   \code{'ABCDEFGHIJKLMNOPQRSTUVWXYZ'}.  Do not change its definition ---
  76   the effect on the routines \function{lower()} and
  77   \function{swapcase()} is undefined.  The specific value is
  78   locale-dependent, and will be updated when
  79   \function{locale.setlocale()} is called.
  80 \end{datadesc}
  81
  82 \begin{datadesc}{whitespace}
  83   A string containing all characters that are considered whitespace.
  84   On most systems this includes the characters space, tab, linefeed,
  85   return, formfeed, and vertical tab.  Do not change its definition ---
  86   the effect on the routines \function{strip()} and \function{split()}
  87   is undefined.
  88 \end{datadesc}
  89
  90 \subsection{Template strings}
  91
  92 Templates provide simpler string substitutions as described in \pep{292}.
  93 Instead of the normal \samp{\%}-based substitutions, Templates support
  94 \samp{\$}-based substitutions, using the following rules:
  95
  96 \begin{itemize}
  97 \item \samp{\$\$} is an escape; it is replaced with a single \samp{\$}.
  98
  99 \item \samp{\$identifier} names a substitution placeholder matching a mapping
 100        key of ``identifier''.  By default, ``identifier'' must spell a Python
 101        identifier.  The first non-identifier character after the \samp{\$}
 102        character terminates this placeholder specification.
 103
 104 \item \samp{\$\{identifier\}} is equivalent to \samp{\$identifier}.  It is
 105       required when valid identifier characters follow the placeholder but are
 106       not part of the placeholder, such as ``\$\{noun\}ification''.
 107 \end{itemize}
 108
 109 Any other appearance of \samp{\$} in the string will result in a
 110 \exception{ValueError} being raised.
 111
 112 \versionadded{2.4}
 113
 114 The \module{string} module provides a \class{Template} class that implements
 115 these rules.  The methods of \class{Template} are:
 116
 117 \begin{classdesc}{Template}{template}
 118 The constructor takes a single argument which is the template string.
 119 \end{classdesc}
 120
 121 \begin{methoddesc}[Template]{substitute}{mapping\optional{, **kws}}
 122 Performs the template substitution, returning a new string.  \var{mapping} is
 123 any dictionary-like object with keys that match the placeholders in the
 124 template.  Alternatively, you can provide keyword arguments, where the
 125 keywords are the placeholders.  When both \var{mapping} and \var{kws} are
 126 given and there are duplicates, the placeholders from \var{kws} take
 127 precedence.
 128 \end{methoddesc}
 129
 130 \begin{methoddesc}[Template]{safe_substitute}{mapping\optional{, **kws}}
 131 Like \method{substitute()}, except that if placeholders are missing from
 132 \var{mapping} and \var{kws}, instead of raising a \exception{KeyError}
 133 exception, the original placeholder will appear in the resulting string
 134 intact.  Also, unlike with \method{substitute()}, any other appearances of
 135 \samp{\$} will simply return \samp{\$} instead of raising
 136 \exception{ValueError}.
 137
 138 While other exceptions may still occur, this method is called ``safe'' because
 139 it always tries to return a usable string instead of raising an
 140 exception.  In another sense, \method{safe_substitute()} may be anything other
 141 than safe, since it will silently ignore malformed templates containing
 142 dangling delimiters, unmatched braces, or placeholders that are not valid
 143 Python identifiers.
 144 \end{methoddesc}
 145
 146 \class{Template} instances also provide one public data attribute:
 147
 148 \begin{memberdesc}[Template]{template}
 149 This is the object passed to the constructor's \var{template} argument.  In
 150 general, you shouldn't change it, but read-only access is not enforced.
 151 \end{memberdesc}
 152
 153 Here is an example of how to use a Template:
 154
 155 \begin{verbatim}
 156 >>> from string import Template
 157 >>> s = Template('$who likes $what')
 158 >>> s.substitute(who='tim', what='kung pao')
 159 'tim likes kung pao'
 160 >>> d = dict(who='tim')
 161 >>> Template('Give $who $100').substitute(d)
 162 Traceback (most recent call last):
 163 [...]
 164 ValueError: Invalid placeholder in string: line 1, col 10
 165 >>> Template('$who likes $what').substitute(d)
 166 Traceback (most recent call last):
 167 [...]
 168 KeyError: 'what'
 169 >>> Template('$who likes $what').safe_substitute(d)
 170 'tim likes $what'
 171 \end{verbatim}
 172
 173 Advanced usage: you can derive subclasses of \class{Template} to customize the
 174 placeholder syntax, delimiter character, or the entire regular expression used
 175 to parse template strings.  To do this, you can override these class
 176 attributes:
 177
 178 \begin{itemize}
 179 \item \var{delimiter} -- This is the literal string describing a placeholder
 180       introducing delimiter.  The default value is \samp{\$}.  Note that this
 181       should \emph{not} be a regular expression, as the implementation will
 182       call \method{re.escape()} on this string as needed.
 183 \item \var{idpattern} -- This is the regular expression describing the pattern
 184       for non-braced placeholders (the braces will be added automatically as
 185       appropriate).  The default value is the regular expression
 186       \samp{[_a-z][_a-z0-9]*}.
 187 \end{itemize}
 188
 189 Alternatively, you can provide the entire regular expression pattern by
 190 overriding the class attribute \var{pattern}.  If you do this, the value must
 191 be a regular expression object with four named capturing groups.  The
 192 capturing groups correspond to the rules given above, along with the invalid
 193 placeholder rule:
 194
 195 \begin{itemize}
 196 \item \var{escaped} -- This group matches the escape sequence,
 197       e.g. \samp{\$\$}, in the default pattern.
 198 \item \var{named} -- This group matches the unbraced placeholder name; it
 199       should not include the delimiter in the capturing group.
 200 \item \var{braced} -- This group matches the brace-enclosed placeholder name;
 201       it should not include either the delimiter or braces in the capturing
 202       group.
 203 \item \var{invalid} -- This group matches any other delimiter pattern (usually
 204       a single delimiter), and it should appear last in the regular
 205       expression.
 206 \end{itemize}
 207
 208 \subsection{String functions}
 209
 210 The following functions are available to operate on string and Unicode
 211 objects.  They are not available as string methods.
 212
 213 \begin{funcdesc}{capwords}{s}
 214   Split the argument into words using \function{split()}, capitalize
 215   each word using \function{capitalize()}, and join the capitalized
 216   words using \function{join()}.  Note that this replaces runs of
 217   whitespace characters by a single space, and removes leading and
 218   trailing whitespace.
 219 \end{funcdesc}
 220
 221 \begin{funcdesc}{maketrans}{from, to}
 222   Return a translation table suitable for passing to
 223   \function{translate()} or \function{regex.compile()}, that will map
 224   each character in \var{from} into the character at the same position
 225   in \var{to}; \var{from} and \var{to} must have the same length.
 226
 227   \warning{Don't use strings derived from \constant{lowercase}
 228   and \constant{uppercase} as arguments; in some locales, these don't have
 229   the same length.  For case conversions, always use
 230   \function{lower()} and \function{upper()}.}
 231 \end{funcdesc}
 232
 233 \subsection{Deprecated string functions}
 234
 235 The following functions are also defined as methods of string and
 236 Unicode objects; see ``String Methods'' (section
 237 \ref{string-methods}) for more information on those.  You should consider
 238 these functions as deprecated, although they will not be removed until Python
 239 3.0.  The functions defined in this module are:
 240
 241 \begin{funcdesc}{atof}{s}
 242   \deprecated{2.0}{Use the \function{float()} built-in function.}
 243   Convert a string to a floating point number.  The string must have
 244   the standard syntax for a floating point literal in Python,
 245   optionally preceded by a sign (\samp{+} or \samp{-}).  Note that
 246   this behaves identically to the built-in function
 247   \function{float()}\bifuncindex{float} when passed a string.
 248
 249   \note{When passing in a string, values for NaN\index{NaN}
 250   and Infinity\index{Infinity} may be returned, depending on the
 251   underlying C library.  The specific set of strings accepted which
 252   cause these values to be returned depends entirely on the C library
 253   and is known to vary.}
 254 \end{funcdesc}
 255
 256 \begin{funcdesc}{atoi}{s\optional{, base}}
 257   \deprecated{2.0}{Use the \function{int()} built-in function.}
 258   Convert string \var{s} to an integer in the given \var{base}.  The
 259   string must consist of one or more digits, optionally preceded by a
 260   sign (\samp{+} or \samp{-}).  The \var{base} defaults to 10.  If it
 261   is 0, a default base is chosen depending on the leading characters
 262   of the string (after stripping the sign): \samp{0x} or \samp{0X}
 263   means 16, \samp{0} means 8, anything else means 10.  If \var{base}
 264   is 16, a leading \samp{0x} or \samp{0X} is always accepted, though
 265   not required.  This behaves identically to the built-in function
 266   \function{int()} when passed a string.  (Also note: for a more
 267   flexible interpretation of numeric literals, use the built-in
 268   function \function{eval()}\bifuncindex{eval}.)
 269 \end{funcdesc}
 270
 271 \begin{funcdesc}{atol}{s\optional{, base}}
 272   \deprecated{2.0}{Use the \function{long()} built-in function.}
 273   Convert string \var{s} to a long integer in the given \var{base}.
 274   The string must consist of one or more digits, optionally preceded
 275   by a sign (\samp{+} or \samp{-}).  The \var{base} argument has the
 276   same meaning as for \function{atoi()}.  A trailing \samp{l} or
 277   \samp{L} is not allowed, except if the base is 0.  Note that when
 278   invoked without \var{base} or with \var{base} set to 10, this
 279   behaves identically to the built-in function
 280   \function{long()}\bifuncindex{long} when passed a string.
 281 \end{funcdesc}
 282
 283 \begin{funcdesc}{capitalize}{word}
 284   Return a copy of \var{word} with only its first character capitalized.
 285 \end{funcdesc}
 286
 287 \begin{funcdesc}{expandtabs}{s\optional{, tabsize}}
 288   Expand tabs in a string replacing them by one or more spaces,
 289   depending on the current column and the given tab size.  The column
 290   number is reset to zero after each newline occurring in the string.
 291   This doesn't understand other non-printing characters or escape
 292   sequences.  The tab size defaults to 8.
 293 \end{funcdesc}
 294
 295 \begin{funcdesc}{find}{s, sub\optional{, start\optional{, end}}}
 296   Return the lowest index in \var{s} where the substring \var{sub} is
 297   found such that \var{sub} is wholly contained in
 298   \code{\var{s}[\var{start}:\var{end}]}.  Return \code{-1} on failure.
 299   Defaults for \var{start} and \var{end} and interpretation of
 300   negative values are the same as for slices.
 301 \end{funcdesc}
 302
 303 \begin{funcdesc}{rfind}{s, sub\optional{, start\optional{, end}}}
 304   Like \function{find()} but find the highest index.
 305 \end{funcdesc}
 306
 307 \begin{funcdesc}{index}{s, sub\optional{, start\optional{, end}}}
 308   Like \function{find()} but raise \exception{ValueError} when the
 309   substring is not found.
 310 \end{funcdesc}
 311
 312 \begin{funcdesc}{rindex}{s, sub\optional{, start\optional{, end}}}
 313   Like \function{rfind()} but raise \exception{ValueError} when the
 314   substring is not found.
 315 \end{funcdesc}
 316
 317 \begin{funcdesc}{count}{s, sub\optional{, start\optional{, end}}}
 318   Return the number of (non-overlapping) occurrences of substring
 319   \var{sub} in string \code{\var{s}[\var{start}:\var{end}]}.
 320   Defaults for \var{start} and \var{end} and interpretation of
 321   negative values are the same as for slices.
 322 \end{funcdesc}
 323
 324 \begin{funcdesc}{lower}{s}
 325   Return a copy of \var{s}, but with upper case letters converted to
 326   lower case.
 327 \end{funcdesc}
 328
 329 \begin{funcdesc}{split}{s\optional{, sep\optional{, maxsplit}}}
 330   Return a list of the words of the string \var{s}.  If the optional
 331   second argument \var{sep} is absent or \code{None}, the words are
 332   separated by arbitrary strings of whitespace characters (space, tab,
 333   newline, return, formfeed).  If the second argument \var{sep} is
 334   present and not \code{None}, it specifies a string to be used as the
 335   word separator.  The returned list will then have one more item
 336   than the number of non-overlapping occurrences of the separator in
 337   the string.  The optional third argument \var{maxsplit} defaults to
 338   0.  If it is nonzero, at most \var{maxsplit} number of splits occur,
 339   and the remainder of the string is returned as the final element of
 340   the list (thus, the list will have at most \code{\var{maxsplit}+1}
 341   elements).
 342
 343   The behavior of split on an empty string depends on the value of \var{sep}.
 344   If \var{sep} is not specified, or specified as \code{None}, the result will
 345   be an empty list.  If \var{sep} is specified as any string, the result will
 346   be a list containing one element which is an empty string.
 347 \end{funcdesc}
 348
 349 \begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
 350   Return a list of the words of the string \var{s}, scanning \var{s}
 351   from the end.  To all intents and purposes, the resulting list of
 352   words is the same as returned by \function{split()}, except when the
 353   optional third argument \var{maxsplit} is explicitly specified and
 354   nonzero.  When \var{maxsplit} is nonzero, at most \var{maxsplit}
 355   number of splits -- the \emph{rightmost} ones -- occur, and the remainder
 356   of the string is returned as the first element of the list (thus, the
 357   list will have at most \code{\var{maxsplit}+1} elements).
 358   \versionadded{2.4}
 359 \end{funcdesc}
 360
 361 \begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
 362   This function behaves identically to \function{split()}.  (In the
 363   past, \function{split()} was only used with one argument, while
 364   \function{splitfields()} was only used with two arguments.)
 365 \end{funcdesc}
 366
 367 \begin{funcdesc}{join}{words\optional{, sep}}
 368   Concatenate a list or tuple of words with intervening occurrences of
 369   \var{sep}.  The default value for \var{sep} is a single space
 370   character.  It is always true that
 371   \samp{string.join(string.split(\var{s}, \var{sep}), \var{sep})}
 372   equals \var{s}.
 373 \end{funcdesc}
 374
 375 \begin{funcdesc}{joinfields}{words\optional{, sep}}
 376   This function behaves identically to \function{join()}.  (In the past,
 377   \function{join()} was only used with one argument, while
 378   \function{joinfields()} was only used with two arguments.)
 379   Note that there is no \method{joinfields()} method on string
 380   objects; use the \method{join()} method instead.
 381 \end{funcdesc}
 382
 383 \begin{funcdesc}{lstrip}{s\optional{, chars}}
 384 Return a copy of the string with leading characters removed.  If
 385 \var{chars} is omitted or \code{None}, whitespace characters are
 386 removed.  If given and not \code{None}, \var{chars} must be a string;
 387 the characters in the string will be stripped from the beginning of
 388 the string \var{s}.
 389 \versionchanged[The \var{chars} parameter was added.  The \var{chars}
 390 parameter cannot be passed in earlier 2.2 versions]{2.2.3}
 391 \end{funcdesc}
 392
 393 \begin{funcdesc}{rstrip}{s\optional{, chars}}
 394 Return a copy of the string with trailing characters removed.  If
 395 \var{chars} is omitted or \code{None}, whitespace characters are
 396 removed.  If given and not \code{None}, \var{chars} must be a string;
 397 the characters in the string will be stripped from the end of the
 398 string \var{s}.
 399 \versionchanged[The \var{chars} parameter was added.  The \var{chars}
 400 parameter cannot be passed in earlier 2.2 versions]{2.2.3}
 401 \end{funcdesc}
 402
 403 \begin{funcdesc}{strip}{s\optional{, chars}}
 404 Return a copy of the string with leading and trailing characters
 405 removed.  If \var{chars} is omitted or \code{None}, whitespace
 406 characters are removed.  If given and not \code{None}, \var{chars}
 407 must be a string; the characters in the string will be stripped from
 408 both ends of the string \var{s}.
 409 \versionchanged[The \var{chars} parameter was added.  The \var{chars}
 410 parameter cannot be passed in earlier 2.2 versions]{2.2.3}
 411 \end{funcdesc}
 412
 413 \begin{funcdesc}{swapcase}{s}
 414   Return a copy of \var{s}, but with lower case letters
 415   converted to upper case and vice versa.
 416 \end{funcdesc}
 417
 418 \begin{funcdesc}{translate}{s, table\optional{, deletechars}}
 419   Delete all characters from \var{s} that are in \var{deletechars} (if
 420   present), and then translate the characters using \var{table}, which
 421   must be a 256-character string giving the translation for each
 422   character value, indexed by its ordinal.
 423 \end{funcdesc}
 424
 425 \begin{funcdesc}{upper}{s}
 426   Return a copy of \var{s}, but with lower case letters converted to
 427   upper case.
 428 \end{funcdesc}
 429
 430 \begin{funcdesc}{ljust}{s, width}
 431 \funcline{rjust}{s, width}
 432 \funcline{center}{s, width}
 433   These functions respectively left-justify, right-justify and center
 434   a string in a field of given width.  They return a string that is at
 435   least \var{width} characters wide, created by padding the string
 436   \var{s} with spaces until the given width on the right, left or both
 437   sides.  The string is never truncated.
 438 \end{funcdesc}
 439
 440 \begin{funcdesc}{zfill}{s, width}
 441   Pad a numeric string on the left with zero digits until the given
 442   width is reached.  Strings starting with a sign are handled
 443   correctly.
 444 \end{funcdesc}
 445
 446 \begin{funcdesc}{replace}{str, old, new\optional{, maxreplace}}
 447   Return a copy of string \var{str} with all occurrences of substring
 448   \var{old} replaced by \var{new}.  If the optional argument
 449   \var{maxreplace} is given, the first \var{maxreplace} occurrences are
 450   replaced.
 451 \end{funcdesc}