gdb/ada-unicode.py

   1 #!/usr/bin/env python3
   2
   3 # Generate Unicode case-folding table for Ada.
   4
   5 # Copyright (C) 2022-2024 Free Software Foundation, Inc.
   6
   7 # This file is part of GDB.
   8
   9 # This program is free software; you can redistribute it and/or modify
  10 # it under the terms of the GNU General Public License as published by
  11 # the Free Software Foundation; either version 3 of the License, or
  12 # (at your option) any later version.
  13
  14 # This program is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details.
  18
  19 # You should have received a copy of the GNU General Public License
  20 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 # This generates the ada-casefold.h header.
  23 # Usage:
  24 #   python ada-unicode.py
  25
  26 import gdbcopyright
  27
  28
  29 class Range:
  30     def __init__(self, range_start: int, upper_delta: int, lower_delta: int):
  31         self._range_start = range_start
  32         self._range_end = range_start
  33         self._upper_delta = upper_delta
  34         self._lower_delta = lower_delta
  35
  36     # The start of the range.
  37     @property
  38     def range_start(self):
  39         return self._range_start
  40
  41     # The end of the range.
  42     @property
  43     def range_end(self):
  44         return self._range_end
  45
  46     @range_end.setter
  47     def range_end(self, val: int):
  48         self._range_end = val
  49
  50     # The delta between RANGE_START and the upper-case variant of that
  51     # character.
  52     @property
  53     def upper_delta(self):
  54         return self._upper_delta
  55
  56     # The delta between RANGE_START and the lower-case variant of that
  57     # character.
  58     @property
  59     def lower_delta(self):
  60         return self._lower_delta
  61
  62
  63 # The current range we are processing.  If None,  then we're outside of a range.
  64 current_range: Range | None = None
  65
  66 # All the ranges found and completed so far.
  67 all_ranges: list[Range] = []
  68
  69
  70 def finish_range():
  71     global current_range
  72
  73     if current_range is not None:
  74         all_ranges.append(current_range)
  75         current_range = None
  76
  77
  78 def process_codepoint(val: int):
  79     global current_range
  80
  81     c = chr(val)
  82     low = c.lower()
  83     up = c.upper()
  84     # U+00DF ("LATIN SMALL LETTER SHARP S", aka eszsett) traditionally
  85     # upper-cases to the two-character string "SS" (the capital form
  86     # is a relatively recent addition -- 2017).  Our simple scheme
  87     # can't handle this, so we skip it.  Also, because our approach
  88     # just represents runs of characters with identical folding
  89     # deltas, this change must terminate the current run.
  90     if (c == low and c == up) or len(low) != 1 or len(up) != 1:
  91         finish_range()
  92         return
  93     updelta = ord(up) - val
  94     lowdelta = ord(low) - val
  95
  96     if current_range is not None and (
  97         updelta != current_range.upper_delta or lowdelta != current_range.lower_delta
  98     ):
  99         finish_range()
 100
 101     if current_range is None:
 102         current_range = Range(val, updelta, lowdelta)
 103
 104     current_range.range_end = val
 105
 106
 107 for c in range(0, 0x10FFFF):
 108     process_codepoint(c)
 109
 110 with open("ada-casefold.h", "w") as f:
 111     print(
 112         gdbcopyright.copyright("ada-unicode.py", "UTF-32 case-folding for GDB"),
 113         file=f,
 114     )
 115     print("", file=f)
 116     for r in all_ranges:
 117         print(
 118             f"   {{{r.range_start}, {r.range_end}, {r.upper_delta}, {r.lower_delta}}},",
 119             file=f,
 120         )