libstdc++-v3/scripts/gen_text_encoding_data.py

   1 #!/usr/bin/env python3
   2 #
   3 # Script to generate tables for libstdc++ std::text_encoding.
   4 #
   5 # This file is part of GCC.
   6 #
   7 # GCC is free software; you can redistribute it and/or modify it under
   8 # the terms of the GNU General Public License as published by the Free
   9 # Software Foundation; either version 3, or (at your option) any later
  10 # version.
  11 #
  12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 # for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GCC; see the file COPYING3.  If not see
  19 # <http://www.gnu.org/licenses/>.
  20
  21 # To update the Libstdc++ static data in <bits/text_encoding-data.h> download
  22 # the latest:
  23 # https://www.iana.org/assignments/character-sets/character-sets-1.csv
  24 # Then run this script and save the output to
  25 # include/bits/text_encoding-data.h
  26
  27 import sys
  28 import csv
  29
  30 if len(sys.argv) != 2:
  31     print("Usage: %s <character sets csv>" % sys.argv[0], file=sys.stderr)
  32     sys.exit(1)
  33
  34 print("""// Generated by gen_text_encoding_data.py, do not edit.
  35
  36 // Copyright The GNU Toolchain Authors.
  37 //
  38 // This file is part of the GNU ISO C++ Library.  This library is free
  39 // software; you can redistribute it and/or modify it under the
  40 // terms of the GNU General Public License as published by the
  41 // Free Software Foundation; either version 3, or (at your option)
  42 // any later version.
  43
  44 // This library is distributed in the hope that it will be useful,
  45 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  46 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  47 // GNU General Public License for more details.
  48
  49 // Under Section 7 of GPL version 3, you are granted additional
  50 // permissions described in the GCC Runtime Library Exception, version
  51 // 3.1, as published by the Free Software Foundation.
  52
  53 // You should have received a copy of the GNU General Public License and
  54 // a copy of the GCC Runtime Library Exception along with this program;
  55 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  56 // <http://www.gnu.org/licenses/>.
  57
  58 /** @file bits/text_encoding-data.h
  59  *  This is an internal header file, included by other library headers.
  60  *  Do not attempt to use it directly. @headername{text_encoding}
  61  */
  62 """)
  63 print("#ifndef _GLIBCXX_GET_ENCODING_DATA")
  64 print('# error "This is not a public header, do not include it directly"')
  65 print("#endif\n")
  66
  67 # We need to generate a list of initializers of the form { mib, alias }, e.g.,
  68 # { 3, "US-ASCII" },
  69 # { 3, "ISO646-US" },
  70 # { 3, "csASCII" },
  71 # { 4, "ISO_8859-1:1987" },
  72 # { 4, "latin1" },
  73 # The initializers must be sorted by the mib value. The first entry for
  74 # a given mib must be the primary name for the encoding. Any aliases for
  75 # the encoding come after the primary name.
  76 # We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the
  77 # offset into the list of the mib=106, alias="UTF-8" entry. This is used
  78 # to optimize the common case, so we don't need to search for "UTF-8".
  79
  80 charsets = {}
  81 with open(sys.argv[1], newline='') as f:
  82     reader = csv.reader(f)
  83     next(reader) # skip header row
  84     for row in reader:
  85         mib = int(row[2])
  86         if mib in charsets:
  87             raise ValueError("Multiple rows for mibEnum={}".format(mib))
  88         name = row[1]
  89         aliases = row[5].split()
  90         # Ensure primary name comes first
  91         if name in aliases:
  92             aliases.remove(name)
  93         charsets[mib] = [name] + aliases
  94
  95 # Remove "NATS-DANO" and "NATS-DANO-ADD" as specified by the C++ standard.
  96 charsets.pop(33, None)
  97 charsets.pop(34, None)
  98
  99 # This is not an official IANA alias, but we include it in the
 100 # implementation-defined superset of aliases for US-ASCII.
 101 # See also LWG 4043.
 102 extra_aliases = {3: ["ASCII"]}
 103
 104 count = 0
 105 for mib in sorted(charsets.keys()):
 106     names = charsets[mib]
 107     if names[0] == "UTF-8":
 108         print("#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET {}".format(count))
 109     for name in names:
 110         print('  {{ {:4}, "{}" }},'.format(mib, name))
 111     count += len(names)
 112     if mib in extra_aliases:
 113         names = extra_aliases[mib]
 114         for name in names:
 115             print('  {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, name))
 116         count += len(names)
 117
 118 # <text_encoding> gives an error if this macro is left defined.
 119 # Do this last, so that the generated output is not usable unless we reach here.
 120 print("\n#undef _GLIBCXX_GET_ENCODING_DATA")