hppa: Always enable PIE on 64-bit target
[official-gcc.git] / contrib / unicode / gen_libstdcxx_unicode_data.py
blobf2f2f8a8ec230cb64e58b2ef439cdf7d7c70ed5f
1 #!/usr/bin/env python3
3 # Script to generate tables for libstdc++ std::format width estimation.
5 # This file is part of GCC.
7 # GCC is free software; you can redistribute it and/or modify it under
8 # the terms of the GNU General Public License as published by the Free
9 # Software Foundation; either version 3, or (at your option) any later
10 # version.
12 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 # for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with GCC; see the file COPYING3. If not see
19 # <http://www.gnu.org/licenses/>.
21 # To update the Libstdc++ static data in <bits/unicode-data.h> download the latest:
22 # ftp://ftp.unicode.org/Public/UNIDATA/EastAsianWidth.txt
23 # ftp://ftp.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
24 # ftp://ftp.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
25 # ftp://ftp.unicode.org/Public/UNIDATA/emoji/emoji-data.txt
26 # Then run this script and save the output to
27 # ../../libstdc++-v3/include/bits/unicode-data.h
29 import sys
30 import re
31 import math
# Emit the banner and the preprocessor guard of the generated header.
# The banner names this script (gen_libstdcxx_unicode_data.py) so that readers
# of the generated file can find the generator that produced it.
print("// Generated by contrib/unicode/gen_libstdcxx_unicode_data.py, do not edit.\n")
# The generated header refuses to be included unless the including file has
# defined _GLIBCXX_GET_UNICODE_DATA to the expected Unicode version number.
print("#ifndef _GLIBCXX_GET_UNICODE_DATA")
print('# error "This is not a public header, do not include it directly"')
print("#elif _GLIBCXX_GET_UNICODE_DATA != 150100")
print('# error "Version mismatch for Unicode static data"')
print("#endif\n")
40 # Process a list and return a list of tuples (index, val) which are the elements
41 # in the list that have a different val from the previous element.
42 # e.g. find_edges([a, a, b, b, c, b, b, d]) is [(0,a), (2,b), (4,c), (5,b), (7,d)]
43 # and find_edges([a, a, b, b, c, b, b, d], a) is [(2,b), (4,c), (5,b), (7,d)]
def find_edges(vals, init = None):
    """Return the (index, value) pairs at which the value in vals changes.

    vals is any iterable.  init is the value assumed to precede the first
    element, so a leading run equal to init produces no edge.
    """
    transitions = []
    last_seen = init
    for position, current in enumerate(vals):
        if current != last_seen:
            # A new run starts here: record where it starts and what it holds.
            transitions.append((position, current))
            last_seen = current
    return transitions
# Module-level table indexed by code point.  process_code_points() writes into
# whatever list this name is bound to at the time of the call.
all_code_points = []

# Process a code point value or range of code point values with given property.
def process_code_points(code_points, val):
    """Store val in all_code_points for one code point or an inclusive range.

    code_points is a hexadecimal code point such as "232A", or an inclusive
    range such as "1100..115F".  Raises ValueError if the text contains more
    than one ".." separator or is not valid hexadecimal.
    """
    parts = code_points.split("..")
    if len(parts) > 2:
        raise ValueError
    first = int(parts[0], base=16)
    if len(parts) == 1:
        # Single code point.
        all_code_points[first] = val
        return
    # The range in the data files is inclusive, so step one past its end.
    stop = int(parts[1], base=16) + 1
    all_code_points[first:stop] = [val] * (stop - first)
# By default every code point has width 1. This is what the C++ standard says,
# even though the Unicode standard says some code points have width 0.
# This rebinds the module-level list that process_code_points() writes into.
all_code_points = [1] * (1 + 0x10FFFF)

# Extract all code points with East_Asian_Width=W or East_Asian_Width=F.
# The Unicode data files are UTF-8, so request that encoding explicitly rather
# than depending on the locale, and close the file as soon as it has been read.
with open("EastAsianWidth.txt", "r", encoding="utf-8") as width_file:
    for line in width_file:
        # Example lines:
        # 3000 ; F
        # 3001..3003 ; W
        line = line.split("#")[0]  # Discard the trailing comment, if any.
        if re.match(r'^[\dA-Fa-f][^;]+;\s*[WF]\s*$', line):
            process_code_points(line.split(";")[0], 2)

# The C++ standard also gives width 2 to the following ranges:
# U+4DC0 – U+4DFF (Yijing Hexagram Symbols)
process_code_points("4DC0..4DFF", 2)
# U+1F300 – U+1F5FF (Miscellaneous Symbols and Pictographs)
process_code_points("1F300..1F5FF", 2)
# U+1F900 – U+1F9FF (Supplemental Symbols and Pictographs)
process_code_points("1F900..1F9FF", 2)

# Create a list that only contains the code points that have a different width
# to the previous code point. The initial value 1 means that a leading run of
# width-1 code points does not produce an edge.
edges = find_edges(all_code_points, 1)

# Table for std::__unicode::__format_width(char32_t)

print(" // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from EastAsianWidth.txt from the Unicode standard.")
print(" inline constexpr char32_t __width_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line; only the code point is emitted.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")
# By default every code point has Grapheme_Cluster_Break=Other.
# This rebinds the module-level list that process_code_points() writes into.
all_code_points = ["Other"] * (1 + 0x10FFFF)

# Extract Grapheme_Cluster_Break property for all code points.
# The Unicode data files are UTF-8, so request that encoding explicitly rather
# than depending on the locale, and close the file as soon as it has been read.
with open("GraphemeBreakProperty.txt", "r", encoding="utf-8") as gcb_file:
    for line in gcb_file:
        # Example arguments passed to process_code_points:
        # "0600..0605", "Prepend"
        # "00AD", "Control"
        line = line.split("#")[0]  # Discard the trailing comment, if any.
        if re.match(r'^[\dA-Fa-f][^;]+;', line):
            code_points, grapheme_property = line.split(";")
            process_code_points(code_points, grapheme_property.strip())

edges = find_edges(all_code_points)
# Number the properties in order of first appearance, with "Other" fixed at 0.
gcb_props = {"Other":0}
for _, p in edges:
    if p not in gcb_props:
        gcb_props[p] = len(gcb_props)
# Number of low bits needed to hold any property value in a packed entry.
shift_bits = int(math.ceil(math.log2(len(gcb_props))))

# Enum definition for std::__unicode::_Gcb_property

print(" enum class _Gcb_property {")
for name, value in gcb_props.items():
    print(" _Gcb_{} = {},".format(name, value))
print(" };\n")

# Tables for std::__unicode::_Grapheme_cluster_state

print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from GraphemeBreakProperty.txt from the Unicode standard.")
print(" // Entries are (code_point << shift_bits) + property.")
print(" inline constexpr int __gcb_shift_bits = {:#x};".format(shift_bits))
print(" inline constexpr uint32_t __gcb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    # Pack the code point and its property number into a single integer.
    x = (c << shift_bits) + gcb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
# By default every code point has Indic_Conjunct_Break=None.
# This rebinds the module-level list that process_code_points() writes into.
all_code_points = [None] * (1 + 0x10FFFF)

# Extract Indic_Conjunct_Break property for all code points.
# The Unicode data files are UTF-8, so request that encoding explicitly rather
# than depending on the locale, and close the file as soon as it has been read.
with open("DerivedCoreProperties.txt", "r", encoding="utf-8") as props_file:
    for line in props_file:
        # Example lines:
        # 094D ; InCB; Linker
        # 0B71 ; InCB; Consonant
        # 0300..034E ; InCB; Extend
        line = line.split("#")[0]  # Discard the trailing comment, if any.
        if re.match(r'^[\dA-Fa-f][^;]+; InCB;', line):
            code_points, _, incb_property = line.split(";")
            process_code_points(code_points, incb_property.strip())

# Table for std::__unicode::__is_incb_linker
# This table is tiny, so just contains the list of code points.
print(" inline constexpr char32_t __incb_linkers[] = {\n ", end="")
linkers = [i for i, p in enumerate(all_code_points) if p == "Linker"]
for i in linkers:
    print(" 0x{:04x},".format(i), end="")
    # Linkers get their own table above, so clear them here to keep them out
    # of the edge table built below.
    all_code_points[i] = None
print("\n };\n")

edges = find_edges(all_code_points)

# Numeric values of the remaining properties; they match the _InCB enumerators
# printed below, and None (no property) is encoded as 0.
incb_props = {None:0, "Consonant":1, "Extend":2}
print(" enum class _InCB { _Consonant = 1, _Extend = 2 };\n")
# Table for std::__unicode::__incb_property
print(" // Values generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from DerivedCoreProperties.txt from the Unicode standard.")
print(" // Entries are (code_point << 2) + property.")
print(" inline constexpr uint32_t __incb_edges[] = {", end="")
for i, (c, p) in enumerate(edges):
    # Six entries per output line.
    if i % 6:
        print(" ", end="")
    else:
        print("\n ", end="")
    # Two low bits are enough for the three property values 0, 1 and 2.
    x = (c << 2) + incb_props[p]
    print("{0:#x},".format(x), end="")
print("\n };\n")
# By default every code point has Emoji=No.
# This rebinds the module-level list that process_code_points() writes into.
all_code_points = [False] * (1 + 0x10FFFF)

# Extract Emoji=Extended_Pictographic for all code points.
# The Unicode data files are UTF-8, so request that encoding explicitly rather
# than depending on the locale, and close the file as soon as it has been read.
with open("emoji-data.txt", "r", encoding="utf-8") as emoji_file:
    for line in emoji_file:
        # Example lines:
        # 1100..115F ; Extended_Pictographic
        # 232A ; Extended_Pictographic
        line = line.split("#")[0]  # Discard the trailing comment, if any.
        if re.match(r'^[\dA-Fa-f][^;]+; Extended_Pictographic', line):
            process_code_points(line.split(";")[0], True)

# The initial value False means that a leading run of non-pictographic code
# points does not produce an edge.
edges = find_edges(all_code_points, False)

# Table for std::__unicode::__is_extended_pictographic
print(" // Table generated by contrib/unicode/gen_libstdcxx_unicode_data.py,")
print(" // from emoji-data.txt from the Unicode standard.")
print(" inline constexpr char32_t __xpicto_edges[] = {", end="")
for i, (c, _) in enumerate(edges):
    # Eight entries per output line; only the code point is emitted.
    if i % 8:
        print(" ", end="")
    else:
        print("\n ", end="")
    print("{:#x},".format(c), end="")
print("\n };\n")

# <bits/unicode.h> gives an error if this macro is left defined.
# Do this last, so that the generated output is not usable unless we reach here.
print("#undef _GLIBCXX_GET_UNICODE_DATA")