contrib/unicode/gen_libstdcxx_unicode_data.py

   1 #!/usr/bin/env python3
   2 #
   3 # Script to generate tables for libstdc++ std::format width estimation.
   4 #
   5 # This file is part of GCC.
   6 #
   7 # GCC is free software; you can redistribute it and/or modify it under
   8 # the terms of the GNU General Public License as published by the Free
   9 # Software Foundation; either version 3, or (at your option) any later
  10 # version.
  11 #
  12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 # for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GCC; see the file COPYING3.  If not see
  19 # <http://www.gnu.org/licenses/>.
  20
  21 # To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
  22 # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
  23 # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
  24 # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
  25 # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
  26 # Then run this script and save the output to
  27 # ../../libstdc++-v3/include/bits/unicode-data.h
  28
  29 import sys
  30 import re
  31 import math
  32
  33 print("// Generated by contrib/unicode/gen_std_format_width.py, do not edit.\n")
  34 print("#ifndef _GLIBCXX_GET_UNICODE_DATA")
  35 print('# error "This is not a public header, do not include it directly"')
  36 print("#elif _GLIBCXX_GET_UNICODE_DATA != 150100")
  37 print('# error "Version mismatch for Unicode static data"')
  38 print("#endif\n")
  39
  40 # Process a list and return a list of tuples (index, val) which are the elements
  41 # in the list that have a different val from the previous element.
  42 # e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
  43 # and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
  44 def find_edges(vals, init = None):
  45     edges = []
  46     prev_val = init
  47     for i, v in enumerate(vals):
  48         if v != prev_val:
  49             edges.append((i,v))
  50             prev_val = v
  51     return edges
  52
  53 all_code_points = []
  54
  55 # Process a code point value or range of code point values with given property.
  56 def process_code_points(code_points, val):
  57     # Example arguments:
  58     # 1100..115F, x
  59     # 232A, y
  60
  61     r = code_points.split("..")
  62     if len(r) == 1:
  63         c = int(r[0], base=16)
  64         all_code_points[c] = val
  65     elif len(r) == 2:
  66         begin = int(r[0], base=16)
  67         end = int(r[1], base=16) + 1
  68         all_code_points[begin:end] = [val] * (end - begin)
  69     else:
  70         raise ValueError
  71
  72 # By default every code point has width 1. This is what the C++ standard says,
  73 # even though the Unicode standard says some code points have width 0.
  74 all_code_points = [1] * (1 + 0x10FFFF)
  75
  76 # Extract all code points with East_Asian_Width=W or East_Asian_Width=F
  77 for line in open("EastAsianWidth.txt", "r"):
  78     # Example lines:
  79     # 3000           ; F
  80     # 3001..3003     ; W
  81     line = line.split("#")[0]
  82     if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
  83         process_code_points(line.split(";")[0], 2)
  84
  85 # The C++ standard also gives width 2 to the following ranges:
  86 # U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
  87 process_code_points("4DC0..4DFF", 2)
  88 # U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
  89 process_code_points("1F300..1F5FF", 2)
  90 # U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
  91 process_code_points("1F900..1F9FF", 2)
  92
  93 # Create a list that only contains the code points that have a different width
  94 # to the previous code point.
  95 edges = find_edges(all_code_points, 1)
  96
  97 # Table for std::__unicode::__format_width(char32_t)
  98
  99 print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
 100 print("  // from EastAsianWidth.txt from the Unicode standard.");
 101 print("  inline constexpr char32_t __width_edges[] = {", end="")
 102 for i, e in enumerate(edges):
 103     if i % 8:
 104         print(" ", end="")
 105     else:
 106         print("\n    ", end="")
 107     c,_ = e
 108     print("{:#x},".format(c), end="")
 109 print("\n  };\n")
 110
 111 # By default every code point has Grapheme_Cluster_Break=Other.
 112 all_code_points = ["Other"] * (1 + 0x10FFFF)
 113
 114 # Extract Grapheme_Cluster_Break property for all code points.
 115 for line in open("GraphemeBreakProperty.txt", "r"):
 116     # Example lines:
 117     # "0600..0605", "Prepend"
 118     # "00AD", "Control"
 119     line = line.split("#")[0]
 120     if re.match(r'^[\dA-Fa-f][^;]+;', line):
 121         code_points, grapheme_property = line.split(";")
 122         process_code_points(code_points, grapheme_property.strip())
 123
 124 edges = find_edges(all_code_points)
 125 gcb_props = {"Other":0}
 126 for c, p in edges:
 127     if p not in gcb_props:
 128         gcb_props[p] = len(gcb_props)
 129 shift_bits = int(math.ceil(math.log2(len(gcb_props))))
 130
 131 # Enum definition for std::__unicode::_Gcb_property
 132
 133 print("  enum class _Gcb_property {")
 134 for p in gcb_props.items():
 135     print("    _Gcb_{} = {},".format(p[0],p[1]))
 136 print("  };\n")
 137
 138 # Tables for std::__unicode::_Grapheme_cluster_state
 139
 140 print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
 141 print("  // from GraphemeBreakProperty.txt from the Unicode standard.");
 142 print("  // Entries are (code_point << shift_bits) + property.")
 143 print("  inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
 144 print("  inline constexpr uint32_t __gcb_edges[] = {", end="")
 145 for i, e in enumerate(edges):
 146     if i % 6:
 147         print(" ", end="")
 148     else:
 149         print("\n    ", end="")
 150     c, p = e
 151     x = (c << shift_bits) + gcb_props[p]
 152     print("{0:#x},".format(x), end="")
 153 print("\n  };\n")
 154
 155 # By default every code point has Indic_Conjunct_Break=None.
 156 all_code_points = [None] * (1 + 0x10FFFF)
 157
 158 # Extract Indic_Conjunct_Break property for all code points.
 159 for line in open("DerivedCoreProperties.txt", "r"):
 160     # Example lines:
 161     # 094D       ; InCB; Linker
 162     # 0B71       ; InCB; Consonant
 163     # 0300..034E ; InCB; Extend
 164     line = line.split("#")[0]
 165     if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
 166         code_points, _, incb_property = line.split(";")
 167         process_code_points(code_points, incb_property.strip())
 168
 169 # Table for std::__unicode::__is_incb_linker
 170 # This table is tiny, so just contains the list of code points.
 171 print("  inline constexpr char32_t __incb_linkers[] = {\n   ", end="")
 172 for i in [i for i,p in enumerate(all_code_points) if p == "Linker"]:
 173     print(" 0x{:04x},".format(i), end="")
 174     all_code_points[i] = None
 175 print("\n  };\n")
 176
 177 edges = find_edges(all_code_points)
 178
 179 incb_props = {None:0, "Consonant":1, "Extend":2}
 180 print("  enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
 181 # Table for std::__unicode::__incb_property
 182 print("  // Values generated by contrib/unicode/gen_std_format_width.py,")
 183 print("  // from DerivedCoreProperties.txt from the Unicode standard.");
 184 print("  // Entries are (code_point << 2) + property.")
 185 print("  inline constexpr uint32_t __incb_edges[] = {", end="")
 186 for i, e in enumerate(edges):
 187     if i % 6:
 188         print(" ", end="")
 189     else:
 190         print("\n    ", end="")
 191     c, p = e
 192     x = (c << 2) + incb_props[p]
 193     print("{0:#x},".format(x), end="")
 194 print("\n  };\n")
 195
 196 # By default every code point has Emoji=No.
 197 all_code_points = [False] * (1 + 0x10FFFF)
 198
 199 # Extract Emoji=Extended_Pictographic for all code points.
 200 for line in open("emoji-data.txt", "r"):
 201     # Example lines:
 202     # 1100..115F ; Extended_Pictographic
 203     # 232A       ; Extended_Pictographic
 204     line = line.split("#")[0]
 205     if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
 206         process_code_points(line.split(";")[0], True)
 207
 208 edges = find_edges(all_code_points, False)
 209
 210 # Table for std::__unicode::__is_extended_pictographic
 211 print("  // Table generated by contrib/unicode/gen_std_format_width.py,")
 212 print("  // from emoji-data.txt from the Unicode standard.");
 213 print("  inline constexpr char32_t __xpicto_edges[] = {", end="")
 214 for i, e in enumerate(edges):
 215     if i % 8:
 216         print(" ", end="")
 217     else:
 218         print("\n    ", end="")
 219     c,_ = e
 220     print("{:#x},".format(c), end="")
 221 print("\n  };\n")
 222
 223 # <bits/unicode.h> gives an error if this macro is left defined.
 224 # Do this last, so that the generated output is not usable unless we reach here.
 225 print("#undef _GLIBCXX_GET_UNICODE_DATA")