1 #!/usr
/local
/bin
/clisp -C
3 ;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
5 ;;; Copyright (C) 2000-2020 Free Software Foundation, Inc.
6 ;;; Written by Bruno Haible <bruno@clisp.org>, 2000-12-28.
8 ;;; This program is free software: you can redistribute it and/or modify
9 ;;; it under the terms of the GNU General Public License as published by
10 ;;; the Free Software Foundation; either version 3 of the License, or
11 ;;; (at your option) any later version.
13 ;;; This program is distributed in the hope that it will be useful,
14 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;;; GNU General Public License for more details.
18 ;;; You should have received a copy of the GNU General Public License
19 ;;; along with this program. If not, see <https://www.gnu.org/licenses/>.
21 (defparameter add-comments nil
)
23 (defstruct unicode-char
24 (index nil
:type integer
)
25 (name nil
:type string
)
31 (index nil
:type integer
)
32 (start-code nil
:type integer
)
33 (end-code nil
:type integer
)
37 (hashed nil
:type hash-table
)
38 (sorted nil
:type list
)
39 size
; number of characters total
40 length
; number of words
43 (defun main (inputfile outputfile aliasfile
)
44 (declare (type string inputfile outputfile aliasfile
))
45 #+UNICODE
(setq *default-file-encoding
* charset
:utf-8
)
47 (all-chars-hashed (make-hash-table :test
#'equal
))
53 ;; Read all characters and names from the input file.
54 (with-open-file (istream inputfile
:direction
:input
)
56 (let ((line (read-line istream nil nil
)))
57 (unless line
(return))
58 (let* ((i1 (position #\
; line))
59 (i2 (position #\
; line :start (1+ i1)))
60 (code-string (subseq line
0 i1
))
61 (code (parse-integer code-string
:radix
16))
62 (name-string (subseq line
(1+ i1
) i2
)))
63 ; Ignore characters whose name starts with "<".
64 (unless (eql (char name-string
0) #\
<)
65 ; Also ignore Hangul syllables; they are treated specially.
66 (unless (<= #xAC00 code
#xD7A3
)
67 ; Also ignore CJK compatibility ideographs; they are treated
69 (unless (or (<= #xF900 code
#xFA2D
) (<= #xFA30 code
#xFA6A
)
70 (<= #xFA70 code
#xFAD9
) (<= #x2F800 code
#x2FA1D
))
71 ;; Also ignore variationselectors; they are treated
73 (unless (or (<= #xFE00 code
#xFE0F
) (<= #xE0100 code
#xE01EF
))
74 (push (make-unicode-char :index name-index
77 (setf (gethash code all-chars-hashed
) (car all-chars
))
78 ;; Update the contiguous range, or start a new range.
79 (if (and range
(= (1+ (range-end-code range
)) code
))
80 (setf (range-end-code range
) code
)
83 (push range all-ranges
))
84 (setq range
(make-range :index name-index
91 (setq all-chars
(nreverse all-chars
))
93 (push range all-ranges
))
94 (setq all-ranges
(nreverse all-ranges
))
96 ;; Read all characters and names from the alias file.
97 (with-open-file (istream aliasfile
:direction
:input
)
99 (let ((line (read-line istream nil nil
)))
100 (unless line
(return))
101 (let* ((i1 (position #\
; line))
102 (i2 (position #\
; line :start (1+ i1)))
103 (code-string (subseq line
0 i1
))
104 (code (parse-integer code-string
:radix
16))
105 (name-string (subseq line
(1+ i1
) i2
))
106 (uc (gethash code all-chars-hashed
)))
108 (push (make-unicode-char :index
(unicode-char-index uc
)
112 (setq all-aliases
(nreverse all-aliases
)
113 all-chars-and-aliases
(append all-chars all-aliases
))
115 (let ((words-by-length (make-array 0 :adjustable t
)))
116 (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" "VARIATION"
117 (mapcar #'unicode-char-name all-chars-and-aliases
)))
120 (when (>= i1
(length name
)) (return))
121 (let ((i2 (or (position #\Space name
:start i1
) (length name
))))
122 (let* ((word (subseq name i1 i2
))
124 (when (>= len
(length words-by-length
))
125 (adjust-array words-by-length
(1+ len
))
127 (unless (aref words-by-length len
)
128 (setf (aref words-by-length len
)
130 :hashed
(make-hash-table :test
#'equal
)
133 (let ((word-list (aref words-by-length len
)))
134 (unless (gethash word
(word-list-hashed word-list
))
135 (setf (gethash word
(word-list-hashed word-list
)) t
)
136 (push word
(word-list-sorted word-list
))
141 ;; Sort the word lists.
142 (dotimes (len (length words-by-length
))
143 (unless (aref words-by-length len
)
144 (setf (aref words-by-length len
)
146 :hashed
(make-hash-table :test
#'equal
)
149 (let ((word-list (aref words-by-length len
)))
150 (setf (word-list-sorted word-list
)
151 (sort (word-list-sorted word-list
) #'string
<)
153 (setf (word-list-size word-list
)
154 (reduce #'+ (mapcar #'length
(word-list-sorted word-list
)))
156 (setf (word-list-length word-list
)
157 (length (word-list-sorted word-list
))
159 ;; Output the tables.
160 (with-open-file (ostream outputfile
:direction
:output
161 #+UNICODE
:external-format
#+UNICODE charset
:ascii
)
162 (format ostream
"/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
163 (format ostream
"/*~%")
164 (format ostream
" * ~A~%" (file-namestring outputfile
))
165 (format ostream
" *~%")
166 (format ostream
" * Unicode character name table.~%")
167 (format ostream
" * Generated automatically by the gen-uninames utility.~%")
168 (format ostream
" */~%")
169 (format ostream
"~%")
170 (format ostream
"static const char unicode_name_words[~D] = {~%"
172 (dotimes (len (length words-by-length
))
173 (let ((word-list (aref words-by-length len
)))
174 (incf sum
(word-list-size word-list
))
178 (dotimes (len (length words-by-length
))
179 (let ((word-list (aref words-by-length len
)))
180 (dolist (word (word-list-sorted word-list
))
181 (format ostream
" ~{ '~C',~}~%" (coerce word
'list
))
183 (format ostream
"};~%")
184 (format ostream
"#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
186 (dotimes (len (length words-by-length
))
187 (let ((word-list (aref words-by-length len
)))
188 (incf sum
(word-list-length word-list
))
193 (format ostream
"static const uint16_t unicode_name_word_offsets[~D] = {~%"
195 (dotimes (len (length words-by-length
))
196 (let ((word-list (aref words-by-length len
)))
197 (incf sum
(word-list-length word-list
))
201 (dotimes (len (length words-by-length
))
202 (let ((word-list (aref words-by-length len
)))
203 (when (word-list-sorted word-list
)
205 (do ((l (word-list-sorted word-list
) (cdr l
))
206 (offset 0 (+ offset
(length (car l
)))))
208 (format ostream
"~<~% ~0,79:; ~D,~>" offset
)
210 (format ostream
"~%")
212 (format ostream
"};~%")
214 (format ostream
"static const struct { uint32_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
215 (1+ (length words-by-length
))
217 (let ((extra-offset 0)
219 (dotimes (len (length words-by-length
))
220 (let ((word-list (aref words-by-length len
)))
221 (format ostream
" { ~D, ~D },~%" extra-offset ind-offset
)
222 (incf extra-offset
(word-list-size word-list
))
223 (incf ind-offset
(word-list-length word-list
))
225 (format ostream
" { ~D, ~D }~%" extra-offset ind-offset
)
227 (format ostream
"};~%")
228 (let ((ind-offset 0))
229 (dotimes (len (length words-by-length
))
230 (let ((word-list (aref words-by-length len
)))
231 (dolist (word (word-list-sorted word-list
))
232 (setf (gethash word
(word-list-hashed word-list
)) ind-offset
)
235 (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY" "VARIATION"))
236 (format ostream
"#define UNICODE_CHARNAME_WORD_~A ~D~%" word
237 (gethash word
(word-list-hashed (aref words-by-length
(length word
))))
239 ;; Compute the word-indices for every unicode-char.
240 (dolist (uc all-chars-and-aliases
)
241 (let ((name (unicode-char-name uc
))
245 (when (>= i1
(length name
)) (return))
246 (let ((i2 (or (position #\Space name
:start i1
) (length name
))))
247 (let* ((word (subseq name i1 i2
))
249 (push (gethash word
(word-list-hashed (aref words-by-length len
)))
255 (setf (unicode-char-word-indices uc
)
256 (coerce (nreverse indices
) 'vector
)
259 ;; Sort the list of unicode-chars by word-indices.
260 (setq all-chars-and-aliases
261 (sort all-chars-and-aliases
263 (let ((len1 (length vec1
))
264 (len2 (length vec2
)))
269 (cond ((< (aref vec1 i
) (aref vec2 i
)) (return t
))
270 ((> (aref vec1 i
) (aref vec2 i
)) (return nil
))
276 :key
#'unicode-char-word-indices
278 ;; Output the word-indices.
279 (format ostream
"static const uint16_t unicode_names[~D] = {~%"
280 (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc
))) all-chars-and-aliases
))
283 (dolist (uc all-chars-and-aliases
)
284 (format ostream
" ~{ ~D,~}"
285 (maplist (lambda (r) (+ (* 2 (car r
)) (if (cdr r
) 1 0)))
286 (coerce (unicode-char-word-indices uc
) 'list
)
290 (format ostream
"~40T/* ~A */" (unicode-char-name uc
))
292 (format ostream
"~%")
293 (setf (unicode-char-word-indices-index uc
) i
)
294 (incf i
(length (unicode-char-word-indices uc
)))
296 (format ostream
"};~%")
297 (format ostream
"static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED unicode_name_to_index[~D] = {~%"
298 (length all-chars-and-aliases
)
300 (dolist (uc all-chars-and-aliases
)
301 (format ostream
" { 0x~4,'0X, ~D },"
302 (unicode-char-index uc
)
303 (unicode-char-word-indices-index uc
)
306 (format ostream
"~21T/* ~A */" (unicode-char-name uc
))
308 (format ostream
"~%")
310 (format ostream
"};~%")
311 (format ostream
"static const struct { uint16_t index; uint32_t name:24; } ATTRIBUTE_PACKED unicode_index_to_name[~D] = {~%"
314 (dolist (uc (sort (copy-list all-chars
) #'< :key
#'unicode-char-index
))
315 (format ostream
" { 0x~4,'0X, ~D },"
316 (unicode-char-index uc
)
317 (unicode-char-word-indices-index uc
)
320 (format ostream
"~21T/* ~A */" (unicode-char-name uc
))
322 (format ostream
"~%")
324 (format ostream
"};~%")
325 (format ostream
"#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
326 (reduce #'max
(mapcar (lambda (uc) (length (unicode-char-name uc
))) all-chars-and-aliases
))
328 (format ostream
"#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
329 (reduce #'max
(mapcar (lambda (uc) (length (unicode-char-word-indices uc
))) all-chars-and-aliases
))
331 (format ostream
"static const struct { uint16_t index; uint32_t gap; uint16_t length; } unicode_ranges[~D] = {~%"
333 (dolist (range all-ranges
)
334 (format ostream
" { ~D, ~D, ~D },~%"
336 (- (range-start-code range
) (range-index range
))
337 (1+ (- (range-end-code range
) (range-start-code range
))))
339 (format ostream
"};~%")
343 (main (first *args
*) (second *args
*) (third *args
*))