Doc/lib/libunicodedata.tex

   1 \section{\module{unicodedata} ---
   2          Unicode Database}
   3
   4 \declaremodule{standard}{unicodedata}
   5 \modulesynopsis{Access the Unicode Database.}
   6 \moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
   7 \sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
   8 \sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
   9
  10 \index{Unicode}
  11 \index{character}
  12 \indexii{Unicode}{database}
  13
  14 This module provides access to the Unicode Character Database which
  15 defines character properties for all Unicode characters. The data in
  16 this database is based on the \file{UnicodeData.txt} file version
  17 3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
  18
  19 The module uses the same names and symbols as defined by the
  20 UnicodeData File Format 3.2.0 (see
  21 \url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}).  It
  22 defines the following functions:
  23
  24 \begin{funcdesc}{lookup}{name}
  25   Look up character by name.  If a character with the
  26   given name is found, return the corresponding Unicode
  27   character.  If not found, \exception{KeyError} is raised.
  28 \end{funcdesc}
  29
  30 \begin{funcdesc}{name}{unichr\optional{, default}}
  31   Returns the name assigned to the Unicode character
  32   \var{unichr} as a string. If no name is defined,
  33   \var{default} is returned, or, if not given,
  34   \exception{ValueError} is raised.
  35 \end{funcdesc}
  36
  37 \begin{funcdesc}{decimal}{unichr\optional{, default}}
  38   Returns the decimal value assigned to the Unicode character
  39   \var{unichr} as an integer. If no such value is defined,
  40   \var{default} is returned, or, if not given,
  41   \exception{ValueError} is raised.
  42 \end{funcdesc}
  43
  44 \begin{funcdesc}{digit}{unichr\optional{, default}}
  45   Returns the digit value assigned to the Unicode character
  46   \var{unichr} as an integer. If no such value is defined,
  47   \var{default} is returned, or, if not given,
  48   \exception{ValueError} is raised.
  49 \end{funcdesc}
  50
  51 \begin{funcdesc}{numeric}{unichr\optional{, default}}
  52   Returns the numeric value assigned to the Unicode character
  53   \var{unichr} as a float. If no such value is defined, \var{default} is
  54   returned, or, if not given, \exception{ValueError} is raised.
  55 \end{funcdesc}
  56
  57 \begin{funcdesc}{category}{unichr}
  58   Returns the general category assigned to the Unicode character
  59   \var{unichr} as a string.
  60 \end{funcdesc}
  61
  62 \begin{funcdesc}{bidirectional}{unichr}
  63   Returns the bidirectional category assigned to the Unicode character
  64   \var{unichr} as a string. If no such value is defined, an empty string
  65   is returned.
  66 \end{funcdesc}
  67
  68 \begin{funcdesc}{combining}{unichr}
  69   Returns the canonical combining class assigned to the Unicode
  70   character \var{unichr} as an integer. Returns \code{0} if no combining
  71   class is defined.
  72 \end{funcdesc}
  73
  74 \begin{funcdesc}{east_asian_width}{unichr}
  75   Returns the East Asian width assigned to the Unicode character
  76   \var{unichr} as a string.
  77 \versionadded{2.4}
  78 \end{funcdesc}
  79
  80 \begin{funcdesc}{mirrored}{unichr}
  81   Returns the mirrored property assigned to the Unicode character
  82   \var{unichr} as an integer. Returns \code{1} if the character has been
  83   identified as a ``mirrored'' character in bidirectional text,
  84   \code{0} otherwise.
  85 \end{funcdesc}
  86
  87 \begin{funcdesc}{decomposition}{unichr}
  88   Returns the character decomposition mapping assigned to the Unicode
  89   character \var{unichr} as a string. An empty string is returned in case
  90   no such mapping is defined.
  91 \end{funcdesc}
  92
  93 \begin{funcdesc}{normalize}{form, unistr}
  94
  95 Return the normal form \var{form} for the Unicode string \var{unistr}.
  96 Valid values for \var{form} are \code{'NFC'}, \code{'NFKC'}, \code{'NFD'}, and \code{'NFKD'}.
  97
  98 The Unicode standard defines various normalization forms of a Unicode
  99 string, based on the definition of canonical equivalence and
 100 compatibility equivalence. In Unicode, several characters can be
 101 expressed in various ways. For example, the character U+00C7 (LATIN
 102 CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
 103 U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
 104
 105 For each character, there are two normal forms: normal form C and
 106 normal form D. Normal form D (NFD) is also known as canonical
 107 decomposition, and translates each character into its decomposed form.
 108 Normal form C (NFC) first applies a canonical decomposition, then
 109 composes pre-combined characters again.
 110
 111 In addition to these two forms, there are two additional normal forms
 112 based on compatibility equivalence. In Unicode, certain characters are
 113 supported which normally would be unified with other characters. For
 114 example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
 115 (LATIN CAPITAL LETTER I). However, it is supported in Unicode for
 116 compatibility with existing character sets (e.g.\ gb2312).
 117
 118 The normal form KD (NFKD) will apply the compatibility decomposition,
 119 i.e.\ replace all compatibility characters with their equivalents. The
 120 normal form KC (NFKC) first applies the compatibility decomposition,
 121 followed by the canonical composition.
 122
 123 \versionadded{2.3}
 124 \end{funcdesc}
 125
 126 In addition, the module exposes the following constant:
 127
 128 \begin{datadesc}{unidata_version}
 129 The version of the Unicode database used in this module.
 130
 131 \versionadded{2.3}
 132 \end{datadesc}