unistr/u{8,16,32}-uctomb: Avoid possible trouble with huge strings.
[gnulib.git] / lib / uniname / gen-uninames.lisp
blob38a798f09cf31809a17df419b732232fe053c658
1 #!/usr/local/bin/clisp -C
3 ;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
5 ;;; Copyright (C) 2000-2020 Free Software Foundation, Inc.
6 ;;; Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
7 ;;;
8 ;;; This program is free software: you can redistribute it and/or modify
9 ;;; it under the terms of the GNU General Public License as published by
10 ;;; the Free Software Foundation; either version 3 of the License, or
11 ;;; (at your option) any later version.
12 ;;;
13 ;;; This program is distributed in the hope that it will be useful,
14 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;;; GNU General Public License for more details.
17 ;;;
18 ;;; You should have received a copy of the GNU General Public License
19 ;;; along with this program. If not, see <https://www.gnu.org/licenses/>.
21 (defparameter add-comments nil)
23 (defstruct unicode-char
24 (index nil :type integer)
25 (name nil :type string)
26 word-indices
27 word-indices-index
30 (defstruct range
31 (index nil :type integer)
32 (start-code nil :type integer)
33 (end-code nil :type integer)
36 (defstruct word-list
37 (hashed nil :type hash-table)
38 (sorted nil :type list)
39 size ; number of characters total
40 length ; number of words
43 (defun main (inputfile outputfile aliasfile)
44 (declare (type string inputfile outputfile aliasfile))
45 #+UNICODE (setq *default-file-encoding* charset:utf-8)
46 (let ((all-chars '())
47 (all-chars-hashed (make-hash-table :test #'equal))
48 (all-aliases '())
49 all-chars-and-aliases
50 (all-ranges '())
51 (name-index 0)
52 range)
53 ;; Read all characters and names from the input file.
54 (with-open-file (istream inputfile :direction :input)
55 (loop
56 (let ((line (read-line istream nil nil)))
57 (unless line (return))
58 (let* ((i1 (position #\; line))
59 (i2 (position #\; line :start (1+ i1)))
60 (code-string (subseq line 0 i1))
61 (code (parse-integer code-string :radix 16))
62 (name-string (subseq line (1+ i1) i2)))
63 ; Ignore characters whose name starts with "<".
64 (unless (eql (char name-string 0) #\<)
65 ; Also ignore Hangul syllables; they are treated specially.
66 (unless (<= #xAC00 code #xD7A3)
67 ; Also ignore CJK compatibility ideographs; they are treated
68 ; specially as well.
69 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
70 (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
71 ;; Also ignore variationselectors; they are treated
72 ;; specially as well.
73 (unless (or (<= #xFE00 code #xFE0F) (<= #xE0100 code #xE01EF))
74 (push (make-unicode-char :index name-index
75 :name name-string)
76 all-chars)
77 (setf (gethash code all-chars-hashed) (car all-chars))
78 ;; Update the contiguous range, or start a new range.
79 (if (and range (= (1+ (range-end-code range)) code))
80 (setf (range-end-code range) code)
81 (progn
82 (when range
83 (push range all-ranges))
84 (setq range (make-range :index name-index
85 :start-code code
86 :end-code code))))
87 (incf name-index)
88 (setq last-code code)
89 ) ) ) )
90 ) ) ) )
91 (setq all-chars (nreverse all-chars))
92 (if range
93 (push range all-ranges))
94 (setq all-ranges (nreverse all-ranges))
95 (when aliasfile
96 ;; Read all characters and names from the alias file.
97 (with-open-file (istream aliasfile :direction :input)
98 (loop
99 (let ((line (read-line istream nil nil)))
100 (unless line (return))
101 (let* ((i1 (position #\; line))
102 (i2 (position #\; line :start (1+ i1)))
103 (code-string (subseq line 0 i1))
104 (code (parse-integer code-string :radix 16))
105 (name-string (subseq line (1+ i1) i2))
106 (uc (gethash code all-chars-hashed)))
107 (when uc
108 (push (make-unicode-char :index (unicode-char-index uc)
109 :name name-string)
110 all-aliases)
111 ) ) ) ) ) )
112 (setq all-aliases (nreverse all-aliases)
113 all-chars-and-aliases (append all-chars all-aliases))
114 ;; Split into words.
115 (let ((words-by-length (make-array 0 :adjustable t)))
116 (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
117 (mapcar #'unicode-char-name all-chars-and-aliases)))
118 (let ((i1 0))
119 (loop
120 (when (>= i1 (length name)) (return))
121 (let ((i2 (or (position #\Space name :start i1) (length name))))
122 (let* ((word (subseq name i1 i2))
123 (len (length word)))
124 (when (>= len (length words-by-length))
125 (adjust-array words-by-length (1+ len))
127 (unless (aref words-by-length len)
128 (setf (aref words-by-length len)
129 (make-word-list
130 :hashed (make-hash-table :test #'equal)
131 :sorted '()
132 ) ) )
133 (let ((word-list (aref words-by-length len)))
134 (unless (gethash word (word-list-hashed word-list))
135 (setf (gethash word (word-list-hashed word-list)) t)
136 (push word (word-list-sorted word-list))
139 (setq i1 (1+ i2))
140 ) ) ) )
141 ;; Sort the word lists.
142 (dotimes (len (length words-by-length))
143 (unless (aref words-by-length len)
144 (setf (aref words-by-length len)
145 (make-word-list
146 :hashed (make-hash-table :test #'equal)
147 :sorted '()
148 ) ) )
149 (let ((word-list (aref words-by-length len)))
150 (setf (word-list-sorted word-list)
151 (sort (word-list-sorted word-list) #'string<)
153 (setf (word-list-size word-list)
154 (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
156 (setf (word-list-length word-list)
157 (length (word-list-sorted word-list))
158 ) ) )
159 ;; Output the tables.
160 (with-open-file (ostream outputfile :direction :output
161 #+UNICODE :external-format #+UNICODE charset:ascii)
162 (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
163 (format ostream "/*~%")
164 (format ostream " * ~A~%" (file-namestring outputfile))
165 (format ostream " *~%")
166 (format ostream " * Unicode character name table.~%")
167 (format ostream " * Generated automatically by the gen-uninames utility.~%")
168 (format ostream " */~%")
169 (format ostream "~%")
170 (format ostream "static const char unicode_name_words[~D] = {~%"
171 (let ((sum 0))
172 (dotimes (len (length words-by-length))
173 (let ((word-list (aref words-by-length len)))
174 (incf sum (word-list-size word-list))
178 (dotimes (len (length words-by-length))
179 (let ((word-list (aref words-by-length len)))
180 (dolist (word (word-list-sorted word-list))
181 (format ostream " ~{ '~C',~}~%" (coerce word 'list))
182 ) ) )
183 (format ostream "};~%")
184 (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
185 (let ((sum 0))
186 (dotimes (len (length words-by-length))
187 (let ((word-list (aref words-by-length len)))
188 (incf sum (word-list-length word-list))
192 #| ; Redundant data
193 (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
194 (let ((sum 0))
195 (dotimes (len (length words-by-length))
196 (let ((word-list (aref words-by-length len)))
197 (incf sum (word-list-length word-list))
201 (dotimes (len (length words-by-length))
202 (let ((word-list (aref words-by-length len)))
203 (when (word-list-sorted word-list)
204 (format ostream " ")
205 (do ((l (word-list-sorted word-list) (cdr l))
206 (offset 0 (+ offset (length (car l)))))
207 ((endp l))
208 (format ostream "~<~% ~0,79:; ~D,~>" offset)
210 (format ostream "~%")
211 ) ) )
212 (format ostream "};~%")
214 (format ostream "static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
215 (1+ (length words-by-length))
217 (let ((extra-offset 0)
218 (ind-offset 0))
219 (dotimes (len (length words-by-length))
220 (let ((word-list (aref words-by-length len)))
221 (format ostream " { ~D, ~D },~%" extra-offset ind-offset)
222 (incf extra-offset (word-list-size word-list))
223 (incf ind-offset (word-list-length word-list))
225 (format ostream " { ~D, ~D }~%" extra-offset ind-offset)
227 (format ostream "};~%")
228 (let ((ind-offset 0))
229 (dotimes (len (length words-by-length))
230 (let ((word-list (aref words-by-length len)))
231 (dolist (word (word-list-sorted word-list))
232 (setf (gethash word (word-list-hashed word-list)) ind-offset)
233 (incf ind-offset)
234 ) ) ) )
235 (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
236 (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
237 (gethash word (word-list-hashed (aref words-by-length (length word))))
239 ;; Compute the word-indices for every unicode-char.
240 (dolist (uc all-chars-and-aliases)
241 (let ((name (unicode-char-name uc))
242 (indices '()))
243 (let ((i1 0))
244 (loop
245 (when (>= i1 (length name)) (return))
246 (let ((i2 (or (position #\Space name :start i1) (length name))))
247 (let* ((word (subseq name i1 i2))
248 (len (length word)))
249 (push (gethash word (word-list-hashed (aref words-by-length len)))
250 indices
253 (setq i1 (1+ i2))
254 ) ) )
255 (setf (unicode-char-word-indices uc)
256 (coerce (nreverse indices) 'vector)
259 ;; Sort the list of unicode-chars by word-indices.
260 (setq all-chars-and-aliases
261 (sort all-chars-and-aliases
262 (lambda (vec1 vec2)
263 (let ((len1 (length vec1))
264 (len2 (length vec2)))
265 (do ((i 0 (1+ i)))
266 (nil)
267 (if (< i len2)
268 (if (< i len1)
269 (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
270 ((> (aref vec1 i) (aref vec2 i)) (return nil))
272 (return t)
274 (return nil)
275 ) ) ) )
276 :key #'unicode-char-word-indices
278 ;; Output the word-indices.
279 (format ostream "static const uint16_t unicode_names[~D] = {~%"
280 (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
282 (let ((i 0))
283 (dolist (uc all-chars-and-aliases)
284 (format ostream " ~{ ~D,~}"
285 (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
286 (coerce (unicode-char-word-indices uc) 'list)
289 (when add-comments
290 (format ostream "~40T/* ~A */" (unicode-char-name uc))
292 (format ostream "~%")
293 (setf (unicode-char-word-indices-index uc) i)
294 (incf i (length (unicode-char-word-indices uc)))
296 (format ostream "};~%")
297 (format ostream "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED unicode_name_to_index[~D] = {~%"
298 (length all-chars-and-aliases)
300 (dolist (uc all-chars-and-aliases)
301 (format ostream " { 0x~4,'0X, ~D },"
302 (unicode-char-index uc)
303 (unicode-char-word-indices-index uc)
305 (when add-comments
306 (format ostream "~21T/* ~A */" (unicode-char-name uc))
308 (format ostream "~%")
310 (format ostream "};~%")
311 (format ostream "static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED unicode_index_to_name[~D] = {~%"
312 (length all-chars)
314 (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-index))
315 (format ostream " { 0x~4,'0X, ~D },"
316 (unicode-char-index uc)
317 (unicode-char-word-indices-index uc)
319 (when add-comments
320 (format ostream "~21T/* ~A */" (unicode-char-name uc))
322 (format ostream "~%")
324 (format ostream "};~%")
325 (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
326 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars-and-aliases))
328 (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
329 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars-and-aliases))
331 (format ostream "static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
332 (length all-ranges))
333 (dolist (range all-ranges)
334 (format ostream " { ~D, ~D, ~D },~%"
335 (range-index range)
336 (- (range-start-code range) (range-index range))
337 (1+ (- (range-end-code range) (range-start-code range))))
339 (format ostream "};~%")
341 ) ) )
343 (main (first *args*) (second *args*) (third *args*))